fix(m-9): aggregate per-endpoint scan errors in NetworkScanService

Before this fix, RunScan declared `scanErrors []string` but never appended to it. As a result: - the summary Info log ("network target scan completed") always reported `"errors": 0`, regardless of how many endpoints failed - the DiscoveryReport's `Errors` field — stored on the scan record and surfaced in the GUI scan history — was always nil Operators who needed to understand scan failures had to enable Debug logging and grep through the noise of expected sweep-scan connection refusals. The per-endpoint log level (Debug) is deliberate and correct — scanning a /24 typically produces 200+ connection-refused results, and logging each at Warn would create massive log spam at default verbosity. The bug was the silent loss of the aggregate count. This commit: - extracts the partitioning logic into `collectScanResults`, a pure method that splits per-endpoint results into discovered certificate entries and a list of endpoint error strings - populates the errors list with "<address>: <error>" so the scan record correlates failures back to specific endpoints - preserves the existing Debug-level per-endpoint log (sweep noise discipline) — no change to default-verbosity log output The summary Info log's "errors" field and the DiscoveryReport's Errors field now reflect the true failure count. Debug detail remains available for operators diagnosing specific endpoints. Audit scope note: the M-9 finding narrative implied broad Debug-level hiding of real errors across AWS SM, Azure KV, GCP SM, and network scan sentinel agents. On investigation, the three cloud-discovery connectors (awssm, azurekv, gcpsm) already use appropriate Warn/Error discipline for per-item and root-level failures. Only the network scanner had a silent observability gap, and it was a missed append rather than a misapplied log level. See audit resolution log for full details. CWE: CWE-778 (Insufficient Logging) — aggregate failure count lost. Tests: 4 new unit tests on collectScanResults covering the aggregation path (success + failure mix), all-success, all-failed, and empty-input degenerate cases. All tests pass with -race. Verification: - go build ./cmd/server/... ./cmd/agent/... ./cmd/mcp-server/... ./cmd/cli/... exit 0 - go vet ./... exit 0 - go test -race -count=1 -timeout 300s [full CI race path] exit 0 - golangci-lint run ./... --timeout 5m (v2.11.4) 0 issues - govulncheck ./... (@latest) 0 in-code vulnerabilities - go test -count=1 -cover ./internal/service/... 68.0% (> 55% threshold) Invariants preserved: - collectScanResults signature: method on *NetworkScanService, input []domain.NetworkScanResult, return ([]DiscoveredCertEntry, []string) - Debug log key names unchanged ("address", "error") - DiscoveryReport schema unchanged (Errors field already existed) - Sentinel agent ID "server-scanner" unchanged - No migration, no API, no wire-format change Refs: M-9 Medium finding; audit resolution log appended in follow-up commit on workspace-level audit report.
2026-06-07 16:31:33 +00:00 · 2026-04-18 02:34:14 +00:00
parent 5a6ec39cfd
commit ef670fa6da
2 changed files with 161 additions and 15 deletions
@@ -235,21 +235,19 @@ func (s *NetworkScanService) scanTarget(ctx context.Context, target *domain.Netw
 	timeout := time.Duration(target.TimeoutMs) * time.Millisecond
 	results := s.scanEndpoints(ctx, endpoints, timeout)

-	// Collect discovered cert entries
-	var entries []domain.DiscoveredCertEntry
-	var scanErrors []string
-	for _, result := range results {
-		if result.Error != "" {
-			// Only log connection errors at debug level (many hosts won't have TLS)
-			if s.logger != nil {
-				s.logger.Debug("scan endpoint error",
-					"address", result.Address,
-					"error", result.Error)
-			}
-			continue
-		}
-		entries = append(entries, result.Certs...)
-	}
+	// Collect discovered cert entries and per-endpoint errors.
+	//
+	// M-9 (operator-observability): before this fix, scanErrors was declared
+	// but never appended to, so the "errors" count in the summary Info log
+	// and the Errors field on the DiscoveryReport were always zero/nil —
+	// silently hiding per-endpoint failures from operators and from the
+	// downstream scan history record. Per-endpoint failures are still logged
+	// at Debug (sweep scans generate high connection-refused noise by design
+	// — most hosts in a CIDR won't have TLS on the probed port), but the
+	// aggregate count and the report's Errors field now reflect reality so
+	// operators can see, via the scan summary and the stored scan record,
+	// how many endpoints failed without having to enable Debug logging.
+	entries, scanErrors := s.collectScanResults(results)

 	scanDuration := time.Since(startTime)
 	if s.logger != nil {
@@ -385,6 +383,44 @@ func incrementIP(ip net.IP) {
 	}
 }

+// collectScanResults partitions per-endpoint scan results into discovered
+// certificate entries and a list of per-endpoint error strings.
+//
+// M-9 (operator-observability): the summary Info log and the DiscoveryReport
+// both report the count of endpoints that failed to probe. Before this helper
+// existed, the caller accumulated entries but never populated the errors
+// slice, so the aggregate error count was always zero and the scan record's
+// Errors field was always nil — silently hiding per-endpoint failures.
+//
+// Per-endpoint errors remain logged at Debug (sweep scans generate high
+// connection-refused noise by design — most hosts in a CIDR won't have TLS
+// on the probed port). Aggregation surfaces the count at Info, preserving
+// Debug-level detail for operators who want it without creating log spam
+// at default verbosity.
+func (s *NetworkScanService) collectScanResults(results []domain.NetworkScanResult) ([]domain.DiscoveredCertEntry, []string) {
+	var entries []domain.DiscoveredCertEntry
+	var scanErrors []string
+	for _, result := range results {
+		if result.Error != "" {
+			// Debug-level is intentional: a sweep scan of a /24 typically
+			// produces 200+ connection-refused results, and logging each
+			// at Warn would create log spam at default verbosity. The
+			// aggregate count in the Info-level scan-completed log surfaces
+			// the failure volume to operators; Debug provides the detail
+			// when diagnosing a specific endpoint.
+			if s.logger != nil {
+				s.logger.Debug("scan endpoint error",
+					"address", result.Address,
+					"error", result.Error)
+			}
+			scanErrors = append(scanErrors, fmt.Sprintf("%s: %s", result.Address, result.Error))
+			continue
+		}
+		entries = append(entries, result.Certs...)
+	}
+	return entries, scanErrors
+}
+
 // scanEndpoints probes TLS endpoints concurrently and returns results.
 func (s *NetworkScanService) scanEndpoints(ctx context.Context, endpoints []string, timeout time.Duration) []domain.NetworkScanResult {
 	results := make([]domain.NetworkScanResult, len(endpoints))