fix: end-to-end certificate lifecycle bugs + integration test environment

Fixes 12 production bugs preventing the full issuance→deployment flow
from working with ACME (Pebble/Let's Encrypt) and step-ca issuers:

ACME connector (acme.go):
- Save orderURI before WaitOrder overwrites it (Go crypto/acme bug)
- Add CreateOrderCert fallback via WaitOrder+FetchCert
- Remove defer-reset in ValidateConfig that caused nil pointer panic
- Add Insecure TLS option for self-signed ACME servers (Pebble)

step-ca connector (stepca.go, jwe.go):
- Real JWE provisioner key loading + decryption (was using ephemeral keys)
- Fix JWT audience (/1.0/sign), sha claim (key fingerprint), kid header
- Custom root CA trust via RootCertPath config
- Remove hardcoded 90-day validity default (let step-ca decide)

NGINX target connector (nginx.go):
- Use sh -c for validate/reload commands (shell interpretation)
- Use filepath.Dir instead of fragile string slicing
- Add private key file writing (agent-mode keys were never deployed)
- Make chain_path write conditional

Server/service layer:
- TriggerRenewalWithActor now creates actual Job records (was no-op)
- createDeploymentJobs falls back to DB query when cert.TargetIDs empty
- ProcessPendingJobs skips agent-routed deployment jobs
- Agent cert pickup path parsing: len(parts)<4 → len(parts)<3
- Health/ready/auth-info endpoints bypass auth middleware
- Write timeout 15s→120s for ACME issuance
- Cert fingerprint computed on CSR submission

Integration test environment (deploy/test/):
- 10-phase test script covering Local CA, ACME, step-ca, revocation,
  discovery, renewal, and API spot checks
- Docker Compose with 7 containers (server, agent, postgres, nginx,
  pebble, challtestsrv, step-ca) on isolated network
- TLS verification checks SAN (not just Subject CN) for modern CA compat

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shankar0123
2026-04-02 17:02:20 -04:00
parent 2238f28610
commit b059ec930f
19 changed files with 2102 additions and 84 deletions
+9 -8
View File
@@ -178,14 +178,15 @@ func (s *AgentService) SubmitCSR(ctx context.Context, agentID string, certID str
}
version := &domain.CertificateVersion{
ID: generateID("certver"),
CertificateID: certID,
SerialNumber: result.Serial,
NotBefore: result.NotBefore,
NotAfter: result.NotAfter,
PEMChain: result.CertPEM + "\n" + result.ChainPEM,
CSRPEM: string(csrPEM),
CreatedAt: time.Now(),
ID: generateID("certver"),
CertificateID: certID,
SerialNumber: result.Serial,
NotBefore: result.NotBefore,
NotAfter: result.NotAfter,
FingerprintSHA256: computeCertFingerprint(result.CertPEM),
PEMChain: result.CertPEM + "\n" + result.ChainPEM,
CSRPEM: string(csrPEM),
CreatedAt: time.Now(),
}
if err := s.certRepo.CreateVersion(ctx, version); err != nil {
+53
View File
@@ -14,10 +14,12 @@ import (
type CertificateService struct {
certRepo repository.CertificateRepository
targetRepo repository.TargetRepository
jobRepo repository.JobRepository
policyService *PolicyService
auditService *AuditService
revSvc *RevocationSvc
caSvc *CAOperationsSvc
keygenMode string
}
// NewCertificateService creates a new certificate service.
@@ -48,6 +50,16 @@ func (s *CertificateService) SetTargetRepo(repo repository.TargetRepository) {
s.targetRepo = repo
}
// SetJobRepo sets the job repository for creating renewal/issuance jobs.
func (s *CertificateService) SetJobRepo(repo repository.JobRepository) {
s.jobRepo = repo
}
// SetKeygenMode sets the key generation mode (agent or server).
func (s *CertificateService) SetKeygenMode(mode string) {
s.keygenMode = mode
}
// List returns a paginated list of certificates matching the filter.
func (s *CertificateService) List(ctx context.Context, filter *repository.CertificateFilter) ([]*domain.ManagedCertificate, int, error) {
certs, total, err := s.certRepo.List(ctx, filter)
@@ -195,6 +207,8 @@ func (s *CertificateService) GetVersions(ctx context.Context, certID string) ([]
}
// TriggerRenewalWithActor initiates a renewal job if the certificate is eligible.
// Creates a Renewal job (or Issuance for new certs) so the scheduler's job processor
// can pick it up and route it through the issuer connector.
func (s *CertificateService) TriggerRenewalWithActor(ctx context.Context, certID string, actor string) error {
cert, err := s.certRepo.Get(ctx, certID)
if err != nil {
@@ -220,6 +234,45 @@ func (s *CertificateService) TriggerRenewalWithActor(ctx context.Context, certID
return fmt.Errorf("failed to update certificate status: %w", err)
}
// Create a renewal job so the job processor can pick it up.
// In agent keygen mode, the job starts as AwaitingCSR so the agent
// generates the key pair and submits a CSR. In server mode, it starts as Pending.
if s.jobRepo != nil {
jobStatus := domain.JobStatusPending
if s.keygenMode == "agent" {
jobStatus = domain.JobStatusAwaitingCSR
}
// Determine job type: Issuance for certs that have never been issued,
// Renewal for certs that already have a version.
jobType := domain.JobTypeRenewal
if cert.ExpiresAt.IsZero() || cert.ExpiresAt.Year() < 2000 {
jobType = domain.JobTypeIssuance
}
job := &domain.Job{
ID: generateID("job"),
CertificateID: cert.ID,
Type: jobType,
Status: jobStatus,
MaxAttempts: 3,
ScheduledAt: time.Now(),
CreatedAt: time.Now(),
}
if err := s.jobRepo.Create(ctx, job); err != nil {
slog.Error("failed to create renewal job", "cert_id", cert.ID, "error", err)
return fmt.Errorf("failed to create renewal job: %w", err)
}
slog.Info("created renewal job via API trigger",
"job_id", job.ID,
"cert_id", cert.ID,
"job_type", string(jobType),
"job_status", string(jobStatus),
"keygen_mode", s.keygenMode)
}
// Record audit event
if err := s.auditService.RecordEvent(ctx, actor, domain.ActorTypeUser,
"renewal_triggered", "certificate", certID,
+10
View File
@@ -54,6 +54,16 @@ func (s *JobService) ProcessPendingJobs(ctx context.Context) error {
// Process each job
for _, job := range pendingJobs {
// Skip deployment jobs that have an agent_id — those are meant for agent
// pickup via GetPendingWork(), not server-side processing. The server should
// only process deployment jobs without an agent (legacy/serverless targets).
if job.Type == domain.JobTypeDeployment && job.AgentID != nil && *job.AgentID != "" {
s.logger.Debug("skipping agent-routed deployment job",
"job_id", job.ID,
"agent_id", *job.AgentID)
continue
}
if err := s.processJob(ctx, job); err != nil {
s.logger.Error("failed to process job",
"job_id", job.ID,
+42 -13
View File
@@ -636,23 +636,50 @@ func (s *RenewalService) CompleteAgentCSRRenewal(ctx context.Context, job *domai
}
// createDeploymentJobs creates pending deployment jobs for each target associated with a cert.
// If cert.TargetIDs is empty (common — the repository doesn't populate this field),
// falls back to querying certificate_target_mappings via targetRepo.ListByCertificate.
func (s *RenewalService) createDeploymentJobs(ctx context.Context, cert *domain.ManagedCertificate) {
if len(cert.TargetIDs) == 0 {
// Resolve targets: prefer in-memory TargetIDs, fall back to DB query
type targetInfo struct {
id string
agentID string
}
var targets []targetInfo
if len(cert.TargetIDs) > 0 {
// TargetIDs populated (e.g. from test or manual wiring)
for _, tid := range cert.TargetIDs {
ti := targetInfo{id: tid}
if s.targetRepo != nil {
if target, err := s.targetRepo.Get(ctx, tid); err == nil && target.AgentID != "" {
ti.agentID = target.AgentID
}
}
targets = append(targets, ti)
}
} else if s.targetRepo != nil {
// TargetIDs empty — query certificate_target_mappings via repository
dbTargets, err := s.targetRepo.ListByCertificate(ctx, cert.ID)
if err != nil {
slog.Error("failed to query targets for certificate", "cert_id", cert.ID, "error", err)
return
}
for _, t := range dbTargets {
targets = append(targets, targetInfo{id: t.ID, agentID: t.AgentID})
}
}
if len(targets) == 0 {
slog.Debug("no targets found for certificate, skipping deployment", "cert_id", cert.ID)
return
}
for _, targetID := range cert.TargetIDs {
tid := targetID
// Resolve agent_id from target for job routing
for _, t := range targets {
tid := t.id
var agentIDPtr *string
if s.targetRepo != nil {
target, err := s.targetRepo.Get(ctx, tid)
if err != nil {
slog.Warn("failed to resolve agent for deployment job", "target_id", tid, "error", err)
} else if target.AgentID != "" {
agentID := target.AgentID
agentIDPtr = &agentID
}
if t.agentID != "" {
aid := t.agentID
agentIDPtr = &aid
}
deployJob := &domain.Job{
@@ -667,7 +694,9 @@ func (s *RenewalService) createDeploymentJobs(ctx context.Context, cert *domain.
CreatedAt: time.Now(),
}
if err := s.jobRepo.Create(ctx, deployJob); err != nil {
slog.Error("failed to create deployment job for target", "target_id", targetID, "error", err)
slog.Error("failed to create deployment job for target", "target_id", tid, "cert_id", cert.ID, "error", err)
} else {
slog.Info("created deployment job", "job_id", deployJob.ID, "cert_id", cert.ID, "target_id", tid, "agent_id", t.agentID)
}
}
}