// certctl/internal/service/issuance_metrics_test.go

// Copyright (c) certctl
// SPDX-License-Identifier: BSL-1.1

package service

import (
	"context"
	"errors"
	"net"
	"sync"
	"testing"
	"time"
)

// TestIssuanceMetrics_RecordAndSnapshot exercises the happy-path
// counter + histogram + failure recording. Asserts:
//   - SnapshotCounters returns exactly the expected (issuer_type, outcome, count)
//     tuples, each of them once
//   - SnapshotFailures returns the expected (issuer_type, error_class, count) tuple
//   - SnapshotDurations returns two entries for the recorded observations
//   - BucketBoundaries returns a non-empty copy that doesn't share backing storage
func TestIssuanceMetrics_RecordAndSnapshot(t *testing.T) {
	m := NewIssuanceMetrics(DefaultIssuanceBucketBoundaries)

	// Record three issuances: two success (one fast, one slow), one failure.
	m.RecordIssuance("local", "success", 50*time.Millisecond) // 0.05 bucket
	m.RecordIssuance("local", "success", 2*time.Second)       // 2.5 bucket
	m.RecordIssuance("digicert", "failure", 90*time.Second)   // 120 bucket
	m.RecordFailure("digicert", "rate_limited")

	counters := m.SnapshotCounters()
	if len(counters) != 2 {
		t.Fatalf("expected 2 counter entries, got %d", len(counters))
	}
	// Track how often each expected tuple shows up: the length check alone
	// would let a snapshot with one tuple duplicated slip through.
	var localSuccessSeen, digicertFailureSeen int
	for _, c := range counters {
		switch {
		case c.IssuerType == "local" && c.Outcome == "success":
			localSuccessSeen++
			if c.Count != 2 {
				t.Errorf("local/success: want 2, got %d", c.Count)
			}
		case c.IssuerType == "digicert" && c.Outcome == "failure":
			digicertFailureSeen++
			if c.Count != 1 {
				t.Errorf("digicert/failure: want 1, got %d", c.Count)
			}
		default:
			t.Errorf("unexpected counter entry: %+v", c)
		}
	}
	if localSuccessSeen != 1 || digicertFailureSeen != 1 {
		t.Errorf("expected local/success and digicert/failure exactly once each, got %d and %d",
			localSuccessSeen, digicertFailureSeen)
	}

	failures := m.SnapshotFailures()
	if len(failures) != 1 {
		t.Fatalf("expected 1 failure entry, got %d", len(failures))
	}
	if failures[0].IssuerType != "digicert" || failures[0].ErrorClass != "rate_limited" || failures[0].Count != 1 {
		t.Errorf("unexpected failure entry: %+v", failures[0])
	}

	durations := m.SnapshotDurations()
	if len(durations) != 2 {
		t.Fatalf("expected 2 duration entries, got %d", len(durations))
	}

	// BucketBoundaries: returned slice must be a copy. Guard the length
	// first so an empty result fails the test instead of panicking on the
	// element-address comparison below.
	b1 := m.BucketBoundaries()
	b2 := m.BucketBoundaries()
	if len(b1) == 0 || len(b2) == 0 {
		t.Fatalf("BucketBoundaries returned an empty slice (len %d and %d)", len(b1), len(b2))
	}
	if &b1[0] == &b2[0] {
		t.Error("BucketBoundaries should return a copy, not shared storage")
	}
}

// TestIssuanceMetrics_HistogramCumulative pins the cumulative-buckets
// contract. Prometheus histograms require buckets to be cumulative —
// `le=0.5` includes everything <= 0.5, including <= 0.05 and <= 0.1.
// Off-by-one here corrupts every quantile query downstream.
//
// The snapshot shape (exactly one entry, at least one bucket per
// configured boundary) is verified before any bucket is indexed, so a
// malformed snapshot fails with a message instead of an
// index-out-of-range panic.
func TestIssuanceMetrics_HistogramCumulative(t *testing.T) {
	m := NewIssuanceMetrics([]float64{0.1, 0.5, 1.0})

	// Observe 100ms (= 0.1s exactly).
	m.RecordIssuance("local", "success", 100*time.Millisecond)
	durs := m.SnapshotDurations()

	// checkBuckets validates the snapshot currently held in durs against
	// the expected cumulative counts. prefix is prepended to every failure
	// message so the two observation phases can be told apart.
	checkBuckets := func(prefix string, want []uint64) {
		t.Helper()
		if len(durs) != 1 {
			t.Fatalf("%sexpected 1 duration entry, got %d", prefix, len(durs))
		}
		if got := len(durs[0].Buckets); got < len(want) {
			t.Fatalf("%sexpected at least %d buckets, got %d", prefix, len(want), got)
		}
		for i, w := range want {
			if durs[0].Buckets[i] != w {
				t.Errorf("%sbucket[%d]: want %d, got %d", prefix, i, w, durs[0].Buckets[i])
			}
		}
	}

	// Boundaries: [0.1, 0.5, 1.0]. 100ms falls into 0.1 bucket and
	// every larger bucket (cumulative). Sum = 0.1, count = 1.
	checkBuckets("", []uint64{1, 1, 1})
	if durs[0].Sum < 0.099 || durs[0].Sum > 0.101 {
		t.Errorf("sum: want ~0.1, got %v", durs[0].Sum)
	}
	if durs[0].Count != 1 {
		t.Errorf("count: want 1, got %d", durs[0].Count)
	}

	// Observe 750ms — falls into 1.0 bucket only (>0.1, >0.5).
	m.RecordIssuance("local", "success", 750*time.Millisecond)
	durs = m.SnapshotDurations()

	// 100ms in all 3, 750ms in only the 1.0 bucket. Sum = 0.1 + 0.75,
	// count = 2.
	checkBuckets("after 750ms — ", []uint64{1, 1, 2})
	if durs[0].Sum < 0.849 || durs[0].Sum > 0.851 {
		t.Errorf("after 750ms — sum: want ~0.85, got %v", durs[0].Sum)
	}
	if durs[0].Count != 2 {
		t.Errorf("after 750ms — count: want 2, got %d", durs[0].Count)
	}
}

// TestIssuanceMetrics_Concurrency stresses RecordIssuance under 100
// goroutines × 1000 ops to assert atomic counter integrity. Race-
// detector clean is non-optional for this test (the whole point of
// IssuanceMetrics is concurrent recording from many service
// goroutines).
//
// Every goroutine records the same ("local", "success") observation, so
// both the counter snapshot and the duration snapshot are expected to
// hold a single entry whose count equals goroutines × opsPerGoroutine.
func TestIssuanceMetrics_Concurrency(t *testing.T) {
	m := NewIssuanceMetrics(DefaultIssuanceBucketBoundaries)

	const goroutines = 100
	const opsPerGoroutine = 1000

	var wg sync.WaitGroup
	wg.Add(goroutines)
	for i := 0; i < goroutines; i++ {
		go func() {
			defer wg.Done()
			for j := 0; j < opsPerGoroutine; j++ {
				m.RecordIssuance("local", "success", 50*time.Millisecond)
			}
		}()
	}
	// All writers must finish before snapshotting, otherwise the totals
	// below would race with in-flight increments.
	wg.Wait()

	wantTotal := uint64(goroutines * opsPerGoroutine)

	counters := m.SnapshotCounters()
	if len(counters) != 1 {
		t.Fatalf("expected 1 counter entry, got %d", len(counters))
	}
	if counters[0].Count != wantTotal {
		t.Errorf("counter under contention: want %d, got %d", wantTotal, counters[0].Count)
	}

	// Check the length before indexing so a missing histogram entry is
	// reported as a test failure rather than an index-out-of-range panic.
	durs := m.SnapshotDurations()
	if len(durs) != 1 {
		t.Fatalf("expected 1 duration entry, got %d", len(durs))
	}
	if durs[0].Count != wantTotal {
		t.Errorf("histogram count under contention: want %d, got %d", wantTotal, durs[0].Count)
	}
}

// TestClassifyError table-drives ClassifyError with representative
// inputs for each class the table names: timeout, network, auth,
// rate_limited, validation, upstream_5xx, upstream_4xx and other.
// Every row runs as its own subtest so a misclassified input is
// reported under its label. A trailing check pins the nil-error
// behaviour.
func TestClassifyError(t *testing.T) {
	type classifyCase struct {
		label    string // subtest name
		input    error  // error handed to ClassifyError
		expected string // class ClassifyError must return
	}

	table := []classifyCase{
		{label: "context_canceled", input: context.Canceled, expected: "timeout"},
		{label: "context_deadline", input: context.DeadlineExceeded, expected: "timeout"},
		{label: "timeout_substring", input: errors.New("operation deadline exceeded"), expected: "timeout"},
		{label: "i_o_timeout", input: errors.New("read tcp: i/o timeout"), expected: "timeout"},
		{
			label:    "net_op_error",
			input:    &net.OpError{Op: "dial", Net: "tcp", Err: errors.New("connection refused")},
			expected: "network",
		},
		{label: "unauthorized_4xx", input: errors.New("DigiCert: 401 Unauthorized"), expected: "auth"},
		{label: "access_denied_aws", input: errors.New("AccessDeniedException: not authorized"), expected: "auth"},
		{label: "forbidden_403", input: errors.New("forbidden: insufficient permissions"), expected: "auth"},
		{label: "rate_limited_429", input: errors.New("Sectigo: 429 too many requests"), expected: "rate_limited"},
		{label: "throttled", input: errors.New("ThrottlingException: rate exceeded"), expected: "rate_limited"},
		{label: "validation_csr", input: errors.New("malformed CSR: invalid PEM block"), expected: "validation"},
		{label: "validation_invalid", input: errors.New("invalid signing algorithm"), expected: "validation"},
		{label: "upstream_503", input: errors.New("ServiceUnavailable: 503"), expected: "upstream_5xx"},
		{label: "upstream_500_internal", input: errors.New("Internal Server Error: 500"), expected: "upstream_5xx"},
		{label: "upstream_404", input: errors.New("NotFound: 404 cert not found"), expected: "upstream_4xx"},
		{label: "network_no_host", input: errors.New("dial tcp: no such host"), expected: "network"},
		{label: "other_unmatched", input: errors.New("something completely unexpected happened"), expected: "other"},
	}

	for i := range table {
		entry := table[i]
		t.Run(entry.label, func(t *testing.T) {
			if class := ClassifyError(entry.input); class != entry.expected {
				t.Errorf("ClassifyError(%q): want %q, got %q", entry.input.Error(), entry.expected, class)
			}
		})
	}

	// Special case: nil → "" so callers that accidentally call us
	// with a nil err don't bump the failure counter.
	class := ClassifyError(nil)
	if class != "" {
		t.Errorf("ClassifyError(nil): want \"\", got %q", class)
	}
}