Merge branch 'fix/m9-sentinel-discovery-log-levels': M-9 sentinel discovery log-level fix

2026-07-26 16:48:12 +00:00 · 2026-04-18 02:53:50 +00:00
parent 5a6ec39cfd ef670fa6da
commit 2c9602db71
2 changed files with 161 additions and 15 deletions
@@ -235,21 +235,19 @@ func (s *NetworkScanService) scanTarget(ctx context.Context, target *domain.Netw
 	timeout := time.Duration(target.TimeoutMs) * time.Millisecond
 	results := s.scanEndpoints(ctx, endpoints, timeout)

-	// Collect discovered cert entries
-	var entries []domain.DiscoveredCertEntry
-	var scanErrors []string
-	for _, result := range results {
-		if result.Error != "" {
-			// Only log connection errors at debug level (many hosts won't have TLS)
-			if s.logger != nil {
-				s.logger.Debug("scan endpoint error",
-					"address", result.Address,
-					"error", result.Error)
-			}
-			continue
-		}
-		entries = append(entries, result.Certs...)
-	}
+	// Collect discovered cert entries and per-endpoint errors.
+	//
+	// M-9 (operator-observability): before this fix, scanErrors was declared
+	// but never appended to, so the "errors" count in the summary Info log
+	// and the Errors field on the DiscoveryReport were always zero/nil —
+	// silently hiding per-endpoint failures from operators and from the
+	// downstream scan history record. Per-endpoint failures are still logged
+	// at Debug (sweep scans generate high connection-refused noise by design
+	// — most hosts in a CIDR won't have TLS on the probed port), but the
+	// aggregate count and the report's Errors field now reflect reality so
+	// operators can see, via the scan summary and the stored scan record,
+	// how many endpoints failed without having to enable Debug logging.
+	entries, scanErrors := s.collectScanResults(results)

 	scanDuration := time.Since(startTime)
 	if s.logger != nil {
@@ -385,6 +383,44 @@ func incrementIP(ip net.IP) {
 	}
 }

+// collectScanResults partitions per-endpoint scan results into discovered
+// certificate entries and a list of per-endpoint error strings.
+//
+// M-9 (operator-observability): the summary Info log and the DiscoveryReport
+// both report the count of endpoints that failed to probe. Before this helper
+// existed, the caller accumulated entries but never populated the errors
+// slice, so the aggregate error count was always zero and the scan record's
+// Errors field was always nil — silently hiding per-endpoint failures.
+//
+// Per-endpoint errors remain logged at Debug (sweep scans generate high
+// connection-refused noise by design — most hosts in a CIDR won't have TLS
+// on the probed port). Aggregation surfaces the count at Info, preserving
+// Debug-level detail for operators who want it without creating log spam
+// at default verbosity.
+func (s *NetworkScanService) collectScanResults(results []domain.NetworkScanResult) ([]domain.DiscoveredCertEntry, []string) {
+	var entries []domain.DiscoveredCertEntry
+	var scanErrors []string
+	for _, result := range results {
+		if result.Error != "" {
+			// Debug-level is intentional: a sweep scan of a /24 typically
+			// produces 200+ connection-refused results, and logging each
+			// at Warn would create log spam at default verbosity. The
+			// aggregate count in the Info-level scan-completed log surfaces
+			// the failure volume to operators; Debug provides the detail
+			// when diagnosing a specific endpoint.
+			if s.logger != nil {
+				s.logger.Debug("scan endpoint error",
+					"address", result.Address,
+					"error", result.Error)
+			}
+			scanErrors = append(scanErrors, fmt.Sprintf("%s: %s", result.Address, result.Error))
+			continue
+		}
+		entries = append(entries, result.Certs...)
+	}
+	return entries, scanErrors
+}
+
 // scanEndpoints probes TLS endpoints concurrently and returns results.
 func (s *NetworkScanService) scanEndpoints(ctx context.Context, endpoints []string, timeout time.Duration) []domain.NetworkScanResult {
 	results := make([]domain.NetworkScanResult, len(endpoints))
@@ -491,3 +491,113 @@ func TestExpandCIDR_SingleLinkLocalIP(t *testing.T) {
 		t.Errorf("expected empty for cloud metadata IP, got %v", ips)
 	}
 }
+
+// TestCollectScanResults_AggregatesErrors is the M-9 regression guard:
+// per-endpoint probe failures must accumulate into the errors slice so the
+// summary Info log and the DiscoveryReport reflect the true failure count.
+// Before the M-9 fix, scanErrors was declared but never appended to, so the
+// aggregate count was always zero and the scan record's Errors field was
+// always nil — silently hiding per-endpoint failures from operators.
+func TestCollectScanResults_AggregatesErrors(t *testing.T) {
+	svc := &NetworkScanService{}
+	results := []domain.NetworkScanResult{
+		{Address: "203.0.113.1:443", Error: "connection refused"},
+		{Address: "203.0.113.2:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "example.com"},
+		}},
+		{Address: "203.0.113.3:443", Error: "tls handshake failure"},
+		{Address: "203.0.113.4:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "internal.example.com"},
+		}},
+		{Address: "203.0.113.5:443", Error: "i/o timeout"},
+	}
+
+	entries, errs := svc.collectScanResults(results)
+
+	if len(entries) != 2 {
+		t.Errorf("expected 2 entries (one per successful probe), got %d", len(entries))
+	}
+	if len(errs) != 3 {
+		t.Fatalf("expected 3 error strings (one per failed probe), got %d: %v", len(errs), errs)
+	}
+
+	// Each error string must be non-empty and include the endpoint address so
+	// the scan record lets operators correlate failures back to endpoints
+	// without needing Debug logging enabled.
+	for i, e := range errs {
+		if e == "" {
+			t.Errorf("error[%d]: expected non-empty error string", i)
+		}
+	}
+
+	// Spot-check that address is threaded through the error strings.
+	if want := "203.0.113.1:443"; errs[0] == "" || errs[0][:len(want)] != want {
+		t.Errorf("errs[0] should start with %q, got %q", want, errs[0])
+	}
+	if want := "203.0.113.3:443"; errs[1] == "" || errs[1][:len(want)] != want {
+		t.Errorf("errs[1] should start with %q, got %q", want, errs[1])
+	}
+	if want := "203.0.113.5:443"; errs[2] == "" || errs[2][:len(want)] != want {
+		t.Errorf("errs[2] should start with %q, got %q", want, errs[2])
+	}
+}
+
+// TestCollectScanResults_AllSuccess exercises the happy path: a scan where
+// every endpoint returned certificates. The errors slice must be nil (not an
+// empty non-nil slice) so the downstream DiscoveryReport.Errors field stays
+// nil as well, preserving the JSON-omitempty behavior that callers rely on.
+func TestCollectScanResults_AllSuccess(t *testing.T) {
+	svc := &NetworkScanService{}
+	results := []domain.NetworkScanResult{
+		{Address: "203.0.113.10:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "a.example.com"},
+		}},
+		{Address: "203.0.113.11:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "b.example.com"},
+		}},
+	}
+
+	entries, errs := svc.collectScanResults(results)
+
+	if len(entries) != 2 {
+		t.Errorf("expected 2 entries, got %d", len(entries))
+	}
+	if errs != nil {
+		t.Errorf("expected nil errors slice on all-success, got %v", errs)
+	}
+}
+
+// TestCollectScanResults_AllFailed exercises the worst-case sweep: every
+// endpoint failed to probe. Entries must be nil, and every failure must be
+// recorded in the errors slice so the scan record is complete.
+func TestCollectScanResults_AllFailed(t *testing.T) {
+	svc := &NetworkScanService{}
+	results := []domain.NetworkScanResult{
+		{Address: "203.0.113.20:443", Error: "connection refused"},
+		{Address: "203.0.113.21:443", Error: "connection refused"},
+		{Address: "203.0.113.22:443", Error: "connection refused"},
+	}
+
+	entries, errs := svc.collectScanResults(results)
+
+	if entries != nil {
+		t.Errorf("expected nil entries on all-failed, got %v", entries)
+	}
+	if len(errs) != 3 {
+		t.Errorf("expected 3 error strings, got %d: %v", len(errs), errs)
+	}
+}
+
+// TestCollectScanResults_Empty guards against a degenerate empty-input case
+// (scanEndpoints returns no results, e.g. if ctx was cancelled before the
+// first probe ran). Both return slices must be nil.
+func TestCollectScanResults_Empty(t *testing.T) {
+	svc := &NetworkScanService{}
+	entries, errs := svc.collectScanResults(nil)
+	if entries != nil {
+		t.Errorf("expected nil entries for empty input, got %v", entries)
+	}
+	if errs != nil {
+		t.Errorf("expected nil errors for empty input, got %v", errs)
+	}
+}