// certctl/internal/scep/intune/rate_limit.go

package intune

import (
	"errors"
	"sync"
	"time"
)

// SCEP RFC 8894 + Intune master bundle Phase 8.6.
//
// PerDeviceRateLimiter is the second line of defense behind the replay cache
// from Phase 7. The replay cache catches the same challenge being submitted
// twice (within the challenge TTL); this rate limiter catches a compromised
// Connector signing key (or a stolen key+cert pair) issuing many DIFFERENT
// valid challenges for the same device subject in a short window.
//
// Threat model:
//
//   - Replay cache (Phase 7): nonce-keyed; catches duplicate submission.
//   - This limiter: (Subject, Issuer)-keyed; catches enrollment-flooding.
//
// Default: 3 enrollments per (device GUID, Connector identity) per 24h.
//
// Sizing: 100,000 distinct device entries (matches the replay cache cap).
// At-cap: oldest entry evicted (small janitor pass) to avoid unbounded
// memory growth on a fleet that grows past the cap.
//
// Why a hand-rolled limiter instead of pulling in the token-bucket limiter
// from golang.org/x/time/rate: the rate package is in go.sum only as an
// indirect (transitive) dependency, NOT a direct one. Adding it would create
// a new direct dependency for ~30 LoC of state machine. The hand-rolled
// version below uses only stdlib (sync.Mutex + time.Time arithmetic) and is
// small enough to fit on one screen.
//
// Algorithm: each (Subject, Issuer) key maps to a bucket holding a window's
// worth of recent enrollment timestamps. On Allow, the bucket prunes
// timestamps older than (now - window) and either appends the current
// timestamp + returns true, or rejects + returns false when the post-prune
// count is already at the cap. This is the "sliding window log" rate
// limiter — exact (no token-leak rounding); O(N_per_key) per-call but N is
// bounded by the cap (3 by default), so effectively O(1).

// ErrRateLimited is the sentinel error returned by Allow when the per-device
// rate limit fires. The handler maps this to a CertRep FAILURE with
// badRequest failInfo + the `rate_limited` metric label.
//
// Allow returns this exact value (unwrapped), so callers can detect it with
// errors.Is(err, ErrRateLimited).
var ErrRateLimited = errors.New("intune: per-device rate limit exceeded for this (subject, issuer) within the configured window")

// PerDeviceRateLimiter is a sliding-window-log rate limiter keyed by
// (Subject, Issuer) tuples derived from a parsed challenge claim.
//
// Construct it with NewPerDeviceRateLimiter: the zero value is not usable
// because its buckets map is nil and Allow writes to that map. The struct
// embeds a sync.Mutex, so it must be shared by pointer, never copied.
//
// Concurrency: the limiter is safe for concurrent Allow calls. The internal
// map is guarded by a mutex; the per-key slices are mutated only while the
// mutex is held.
type PerDeviceRateLimiter struct {
	mu       sync.Mutex             // guards buckets
	buckets  map[string][]time.Time // key ("subject|issuer") → accepted-enrollment timestamps, in append order
	maxN     int                    // max enrollments per key per window
	window   time.Duration          // window length (constructor default: 24h)
	cap      int                    // max tracked keys; at this size Allow evicts the bucket whose newest timestamp is oldest
	disabled bool                   // true when maxN <= 0; every Allow call then returns nil
}

// NewPerDeviceRateLimiter builds a limiter that admits at most maxN
// enrollments per key within each sliding window.
//
//   - maxN: per-key budget. A value ≤ 0 switches the limiter off, so every
//     Allow call returns nil (operator opt-out, e.g. for test harnesses or
//     sketchpad deploys).
//   - window: length of the sliding window; any value ≤ 0 falls back to 24h.
//   - mapCap: maximum number of tracked keys; any value ≤ 0 falls back to
//     100,000 (the same figure the package notes give for the replay cache).
func NewPerDeviceRateLimiter(maxN int, window time.Duration, mapCap int) *PerDeviceRateLimiter {
	const (
		fallbackWindow = 24 * time.Hour
		fallbackMapCap = 100_000
	)

	// Start from the fallbacks and overwrite them only with usable values.
	limiter := &PerDeviceRateLimiter{
		buckets:  map[string][]time.Time{},
		maxN:     maxN,
		window:   fallbackWindow,
		cap:      fallbackMapCap,
		disabled: maxN < 1,
	}
	if window > 0 {
		limiter.window = window
	}
	if mapCap > 0 {
		limiter.cap = mapCap
	}
	return limiter
}

// Allow checks whether an enrollment for the given (subject, issuer) tuple
// is permitted at time `now`. It returns nil when the enrollment is allowed
// (and records `now` in the key's bucket) or ErrRateLimited when the bucket
// already holds maxN timestamps inside the window.
//
// A disabled limiter always returns nil.
//
// Empty subject is treated as "skip the limiter" — the caller's claim
// validation should have rejected an empty-subject claim already; this is
// belt-and-suspenders to prevent a single empty-subject bucket from
// becoming a fleet-wide chokepoint.
//
// Eviction only runs when a key that is NOT yet tracked is about to be
// inserted into a full map. Evicting on every at-cap call would let a
// request for an already-tracked key evict a bucket — possibly its own, if
// it happens to be the oldest — which wipes that device's history and lets
// a rate-limited device through.
func (l *PerDeviceRateLimiter) Allow(subject, issuer string, now time.Time) error {
	if l.disabled {
		return nil
	}
	if subject == "" {
		// Caller's claim validation should reject empty-subject upstream;
		// this short-circuit is defense-in-depth so a misconfigured
		// Connector can't DoS us via the rate-limit path.
		return nil
	}
	// NOTE(review): the "|" separator is not escaped, so ("a|b", "c") and
	// ("a", "b|c") would share a bucket. Presumably harmless because
	// subjects are device GUIDs — confirm against the claim parser.
	key := subject + "|" + issuer

	l.mu.Lock()
	defer l.mu.Unlock()

	// At-cap eviction: only needed when this call would add a new key.
	// evictOldestLocked is O(N) in the number of tracked keys.
	bucket, tracked := l.buckets[key]
	if !tracked && len(l.buckets) >= l.cap {
		l.evictOldestLocked(now)
	}

	bucket = pruneOlderThan(bucket, now.Add(-l.window))

	if len(bucket) >= l.maxN {
		// Don't append; over the limit. Persist the pruned bucket so the
		// next call sees the most-recently-pruned state.
		l.buckets[key] = bucket
		return ErrRateLimited
	}

	l.buckets[key] = append(bucket, now)
	return nil
}

// pruneOlderThan drops the leading run of timestamps that fall strictly
// before `cutoff` and returns what remains, in the original order. An entry
// equal to `cutoff` is kept.
//
// The scan stops at the first entry that is not before `cutoff`, so it
// relies on `b` being in non-decreasing time order (callers append in that
// order). When nothing is dropped the input slice itself is returned;
// otherwise the survivors are copied into a freshly allocated slice so the
// old backing array, and the expired entries in it, can be collected.
func pruneOlderThan(b []time.Time, cutoff time.Time) []time.Time {
	keepFrom := len(b)
	for idx, ts := range b {
		if !ts.Before(cutoff) {
			keepFrom = idx
			break
		}
	}
	if keepFrom == 0 {
		return b
	}
	survivors := make([]time.Time, 0, len(b)-keepFrom)
	return append(survivors, b[keepFrom:]...)
}

// evictOldestLocked removes exactly one map entry to make room for a new
// key. The caller must hold l.mu.
//
// Selection rule: an empty bucket, if one is encountered during the scan,
// is deleted on the spot and the scan stops. Otherwise the bucket whose
// most recent timestamp (its last element) is the earliest is deleted. Cost
// is one pass over every tracked key.
//
// `now` is accepted but not used; it is kept in the signature so a future
// eviction strategy (e.g. LRU keyed by time of last Allow) can use it.
func (l *PerDeviceRateLimiter) evictOldestLocked(now time.Time) {
	_ = now // intentionally unused, see doc comment

	var (
		victim   string
		victimTs time.Time
		found    bool
	)
	for key, stamps := range l.buckets {
		n := len(stamps)
		if n == 0 {
			// Nothing worth keeping here: drop it and stop scanning.
			delete(l.buckets, key)
			return
		}
		if last := stamps[n-1]; !found || last.Before(victimTs) {
			victim, victimTs, found = key, last, true
		}
	}
	if victim == "" {
		return
	}
	delete(l.buckets, victim)
}

// Len reports how many distinct (subject, issuer) keys the limiter is
// tracking at the moment of the call. It takes l.mu for the read, so the
// value is a consistent snapshot, but it may be stale immediately afterwards
// if Allow is running concurrently. Intended for observability and tests.
func (l *PerDeviceRateLimiter) Len() int {
	l.mu.Lock()
	tracked := len(l.buckets)
	l.mu.Unlock()
	return tracked
}

// Disabled reports whether the limiter is in opt-out mode, i.e. it was
// constructed with maxN ≤ 0 and every Allow call returns nil. Handy for
// handler-side gating and admin-endpoint observability.
//
// The flag is read without taking l.mu; within this file it is only ever
// written by the constructor.
func (l *PerDeviceRateLimiter) Disabled() bool {
	optedOut := l.disabled
	return optedOut
}