certctl/internal/service/crl_cache.go

package service

import (
	"context"
	"crypto/x509"
	"errors"
	"fmt"
	"log/slog"
	"sync"
	"time"

	"github.com/shankar0123/certctl/internal/domain"
	"github.com/shankar0123/certctl/internal/repository"
)

// CRLCacheService is the read-through + scheduler-driven cache layer
// for pre-generated CRLs. The HTTP handler at
// /.well-known/pki/crl/{issuer_id} reads via Get; the
// scheduler.crlGenerationLoop drives RegenerateAll on a tick.
//
// Bundle CRL/OCSP-Responder Phase 3.
//
// Concurrency model:
//
//   - The cache row is the source of truth (one row per issuer).
//   - Get returns the cached row when fresh; on miss / staleness it
//     calls regenerateOne behind a singleflight gate keyed by issuer
//     ID so concurrent miss requests for the same issuer collapse to
//     a single underlying generation call.
//   - RegenerateAll iterates every issuer in the registry, calling
//     regenerateOne for each. Per-issuer failures are logged + audited
//     via crl_generation_events; one bad issuer does not stop the
//     others.
//   - The CA-side CRL generation (caSvc.GenerateDERCRL → issuer
//     connector.GenerateCRL) is unchanged. This service is additive:
//     it persists results, surfaces them via Get, and tracks events.
type CRLCacheService struct {
	cacheRepo repository.CRLCacheRepository
	caSvc     *CAOperationsSvc
	registry  *IssuerRegistry
	logger    *slog.Logger

	// singleflight collapses concurrent regeneration requests for the
	// same issuer ID. A simpler alternative to vendoring
	// golang.org/x/sync/singleflight; this in-tree version is ~30 LoC
	// and matches the project's "no new deps unless necessary" rule.
	flight sync.Map // issuerID → *flightEntry
}

// flightEntry coordinates a single in-flight generation across
// concurrent callers. The first arrival kicks off the work; later
// arrivals wait on done and read the shared result. Pattern matches
// golang.org/x/sync/singleflight semantics for the single-call case
// (we don't need the multi-result Forget capability here).
type flightEntry struct {
	done   chan struct{}
	result *domain.CRLCacheEntry
	err    error
}

// NewCRLCacheService constructs a cache service. caSvc must already
// have its issuer registry wired (CAOperationsSvc.SetIssuerRegistry).
func NewCRLCacheService(
	cacheRepo repository.CRLCacheRepository,
	caSvc *CAOperationsSvc,
	registry *IssuerRegistry,
	logger *slog.Logger,
) *CRLCacheService {
	return &CRLCacheService{
		cacheRepo: cacheRepo,
		caSvc:     caSvc,
		registry:  registry,
		logger:    logger,
	}
}

// Get returns the cached CRL DER + thisUpdate timestamp for an issuer.
// On cache hit the path is purely a DB read (~ms). On miss or
// staleness (next_update in the past), Get triggers an immediate
// regeneration via the singleflight gate so concurrent requests
// collapse to one underlying call.
func (s *CRLCacheService) Get(ctx context.Context, issuerID string) ([]byte, time.Time, error) {
	if s.cacheRepo == nil {
		return nil, time.Time{}, errors.New("crl_cache service: cache repo not configured")
	}

	now := time.Now().UTC()
	entry, err := s.cacheRepo.Get(ctx, issuerID)
	if err != nil {
		return nil, time.Time{}, fmt.Errorf("crl_cache service get %q: %w", issuerID, err)
	}
	if entry != nil && !entry.IsStale(now) {
		return entry.CRLDER, entry.ThisUpdate, nil
	}

	// Miss or stale → regenerate behind the singleflight gate.
	fresh, err := s.regenerateOne(ctx, issuerID)
	if err != nil {
		return nil, time.Time{}, err
	}
	return fresh.CRLDER, fresh.ThisUpdate, nil
}

// RegenerateAll walks every issuer in the registry, calling
// regenerateOne for each. Per-issuer failures are logged + audited
// (via crl_generation_events); a single bad issuer does not stop
// the others. Called by scheduler.crlGenerationLoop on each tick.
//
// Issuers whose connector returns nil from GenerateCRL (e.g., ACME,
// Vault PKI, DigiCert — they manage their own CRL distribution) are
// skipped silently; the regenerateOne path detects nil and treats it
// as "no CRL to cache" rather than an error.
func (s *CRLCacheService) RegenerateAll(ctx context.Context) {
	if s.registry == nil {
		s.logger.Warn("CRL cache RegenerateAll: registry not configured; nothing to do")
		return
	}

	issuers := s.registry.List()
	for issuerID := range issuers {
		select {
		case <-ctx.Done():
			s.logger.Warn("CRL cache RegenerateAll: ctx cancelled mid-cycle",
				"completed", issuerID)
			return
		default:
		}

		if _, err := s.regenerateOne(ctx, issuerID); err != nil {
			// regenerateOne already logs + audits the failure; log here
			// only at debug level to avoid double-noise.
			s.logger.Debug("CRL cache RegenerateAll: per-issuer failure",
				"issuer_id", issuerID, "error", err)
		}
	}
}

// regenerateOne is the singleflight-gated worker. The first concurrent
// call for an issuer ID executes the generation; later calls block on
// the in-flight entry's done channel and return the same result.
//
// The gate is released in a defer so callers can rely on subsequent
// calls (after the result is observed) starting a fresh generation.
func (s *CRLCacheService) regenerateOne(ctx context.Context, issuerID string) (*domain.CRLCacheEntry, error) {
	// Check for an in-flight generation. LoadOrStore atomically:
	//   - If absent: stores our entry as the in-flight one and returns
	//     it; we kick off the work.
	//   - If present: returns the existing entry; we wait on it.
	mine := &flightEntry{done: make(chan struct{})}
	actual, loaded := s.flight.LoadOrStore(issuerID, mine)
	entry := actual.(*flightEntry)

	if loaded {
		// Another goroutine is already generating. Wait for them.
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-entry.done:
		}
		if entry.err != nil {
			return nil, entry.err
		}
		return entry.result, nil
	}

	// We are the leader; do the work and signal others on done.
	defer func() {
		s.flight.Delete(issuerID)
		close(mine.done)
	}()

	mine.result, mine.err = s.doRegenerate(ctx, issuerID)
	return mine.result, mine.err
}

// doRegenerate is the actual work: ask CAOperationsSvc to build the
// CRL DER, parse it to recover thisUpdate/nextUpdate, persist into
// crl_cache, and record an audit event in crl_generation_events.
func (s *CRLCacheService) doRegenerate(ctx context.Context, issuerID string) (*domain.CRLCacheEntry, error) {
	if s.caSvc == nil {
		return nil, errors.New("crl_cache service: caSvc not configured")
	}

	startedAt := time.Now().UTC()

	// Build the CRL via the existing on-demand path.
	derBytes, err := s.caSvc.GenerateDERCRL(ctx, issuerID)
	if err != nil {
		s.recordEvent(ctx, &domain.CRLGenerationEvent{
			IssuerID:  issuerID,
			StartedAt: startedAt,
			Duration:  time.Since(startedAt),
			Succeeded: false,
			Error:     err.Error(),
		})
		return nil, fmt.Errorf("crl_cache service generate %q: %w", issuerID, err)
	}

	// Parse to extract thisUpdate / nextUpdate / number / count.
	parsed, perr := x509.ParseRevocationList(derBytes)
	if perr != nil {
		s.recordEvent(ctx, &domain.CRLGenerationEvent{
			IssuerID:  issuerID,
			StartedAt: startedAt,
			Duration:  time.Since(startedAt),
			Succeeded: false,
			Error:     "parse generated CRL: " + perr.Error(),
		})
		return nil, fmt.Errorf("crl_cache service parse %q: %w", issuerID, perr)
	}

	crlNumber := int64(0)
	if parsed.Number != nil {
		crlNumber = parsed.Number.Int64()
	}

	entry := &domain.CRLCacheEntry{
		IssuerID:           issuerID,
		CRLDER:             derBytes,
		CRLNumber:          crlNumber,
		ThisUpdate:         parsed.ThisUpdate,
		NextUpdate:         parsed.NextUpdate,
		GeneratedAt:        startedAt,
		GenerationDuration: time.Since(startedAt),
		RevokedCount:       len(parsed.RevokedCertificateEntries),
	}
	if err := s.cacheRepo.Put(ctx, entry); err != nil {
		s.recordEvent(ctx, &domain.CRLGenerationEvent{
			IssuerID:  issuerID,
			CRLNumber: crlNumber,
			StartedAt: startedAt,
			Duration:  time.Since(startedAt),
			Succeeded: false,
			Error:     "persist cache row: " + err.Error(),
		})
		return nil, fmt.Errorf("crl_cache service persist %q: %w", issuerID, err)
	}

	s.recordEvent(ctx, &domain.CRLGenerationEvent{
		IssuerID:     issuerID,
		CRLNumber:    crlNumber,
		Duration:     entry.GenerationDuration,
		RevokedCount: entry.RevokedCount,
		StartedAt:    startedAt,
		Succeeded:    true,
	})

	s.logger.Info("CRL pre-generated and cached",
		"issuer_id", issuerID,
		"crl_number", crlNumber,
		"revoked_count", entry.RevokedCount,
		"this_update", entry.ThisUpdate,
		"next_update", entry.NextUpdate,
		"duration_ms", entry.GenerationDuration.Milliseconds())
	return entry, nil
}

// recordEvent persists a generation event but does NOT propagate
// failure-to-record back to the caller — the event log is a
// best-effort audit trail; missing it should not turn a successful
// CRL generation into an error.
func (s *CRLCacheService) recordEvent(ctx context.Context, evt *domain.CRLGenerationEvent) {
	if s.cacheRepo == nil {
		return
	}
	if err := s.cacheRepo.RecordGenerationEvent(ctx, evt); err != nil {
		s.logger.Warn("crl_cache service: failed to record generation event",
			"issuer_id", evt.IssuerID, "error", err)
	}
}