fix(m-9): aggregate per-endpoint scan errors in NetworkScanService

Before this fix, RunScan declared `scanErrors []string` but never appended to it. As a result: - the summary Info log ("network target scan completed") always reported `"errors": 0`, regardless of how many endpoints failed - the DiscoveryReport's `Errors` field — stored on the scan record and surfaced in the GUI scan history — was always nil Operators who needed to understand scan failures had to enable Debug logging and grep through the noise of expected sweep-scan connection refusals. The per-endpoint log level (Debug) is deliberate and correct — scanning a /24 typically produces 200+ connection-refused results, and logging each at Warn would create massive log spam at default verbosity. The bug was the silent loss of the aggregate count. This commit: - extracts the partitioning logic into `collectScanResults`, a pure method that splits per-endpoint results into discovered certificate entries and a list of endpoint error strings - populates the errors list with "<address>: <error>" so the scan record correlates failures back to specific endpoints - preserves the existing Debug-level per-endpoint log (sweep noise discipline) — no change to default-verbosity log output The summary Info log's "errors" field and the DiscoveryReport's Errors field now reflect the true failure count. Debug detail remains available for operators diagnosing specific endpoints. Audit scope note: the M-9 finding narrative implied broad Debug-level hiding of real errors across AWS SM, Azure KV, GCP SM, and network scan sentinel agents. On investigation, the three cloud-discovery connectors (awssm, azurekv, gcpsm) already use appropriate Warn/Error discipline for per-item and root-level failures. Only the network scanner had a silent observability gap, and it was a missed append rather than a misapplied log level. See audit resolution log for full details. CWE: CWE-778 (Insufficient Logging) — aggregate failure count lost. Tests: 4 new unit tests on collectScanResults covering the aggregation path (success + failure mix), all-success, all-failed, and empty-input degenerate cases. All tests pass with -race. Verification: - go build ./cmd/server/... ./cmd/agent/... ./cmd/mcp-server/... ./cmd/cli/... exit 0 - go vet ./... exit 0 - go test -race -count=1 -timeout 300s [full CI race path] exit 0 - golangci-lint run ./... --timeout 5m (v2.11.4) 0 issues - govulncheck ./... (@latest) 0 in-code vulnerabilities - go test -count=1 -cover ./internal/service/... 68.0% (> 55% threshold) Invariants preserved: - collectScanResults signature: method on *NetworkScanService, input []domain.NetworkScanResult, return ([]DiscoveredCertEntry, []string) - Debug log key names unchanged ("address", "error") - DiscoveryReport schema unchanged (Errors field already existed) - Sentinel agent ID "server-scanner" unchanged - No migration, no API, no wire-format change Refs: M-9 Medium finding; audit resolution log appended in follow-up commit on workspace-level audit report.
2026-07-28 04:49:42 +00:00 · 2026-04-18 02:34:14 +00:00
parent 59e0fa9cdd
commit 875f433c52
2 changed files with 161 additions and 15 deletions
@@ -235,21 +235,19 @@ func (s *NetworkScanService) scanTarget(ctx context.Context, target *domain.Netw
 	timeout := time.Duration(target.TimeoutMs) * time.Millisecond
 	results := s.scanEndpoints(ctx, endpoints, timeout)

-	// Collect discovered cert entries
-	var entries []domain.DiscoveredCertEntry
-	var scanErrors []string
-	for _, result := range results {
-		if result.Error != "" {
-			// Only log connection errors at debug level (many hosts won't have TLS)
-			if s.logger != nil {
-				s.logger.Debug("scan endpoint error",
-					"address", result.Address,
-					"error", result.Error)
-			}
-			continue
-		}
-		entries = append(entries, result.Certs...)
-	}
+	// Collect discovered cert entries and per-endpoint errors.
+	//
+	// M-9 (operator-observability): before this fix, scanErrors was declared
+	// but never appended to, so the "errors" count in the summary Info log
+	// and the Errors field on the DiscoveryReport were always zero/nil —
+	// silently hiding per-endpoint failures from operators and from the
+	// downstream scan history record. Per-endpoint failures are still logged
+	// at Debug (sweep scans generate high connection-refused noise by design
+	// — most hosts in a CIDR won't have TLS on the probed port), but the
+	// aggregate count and the report's Errors field now reflect reality so
+	// operators can see, via the scan summary and the stored scan record,
+	// how many endpoints failed without having to enable Debug logging.
+	entries, scanErrors := s.collectScanResults(results)

 	scanDuration := time.Since(startTime)
 	if s.logger != nil {
@@ -385,6 +383,44 @@ func incrementIP(ip net.IP) {
 	}
 }

+// collectScanResults partitions per-endpoint scan results into discovered
+// certificate entries and a list of per-endpoint error strings.
+//
+// M-9 (operator-observability): the summary Info log and the DiscoveryReport
+// both report the count of endpoints that failed to probe. Before this helper
+// existed, the caller accumulated entries but never populated the errors
+// slice, so the aggregate error count was always zero and the scan record's
+// Errors field was always nil — silently hiding per-endpoint failures.
+//
+// Per-endpoint errors remain logged at Debug (sweep scans generate high
+// connection-refused noise by design — most hosts in a CIDR won't have TLS
+// on the probed port). Aggregation surfaces the count at Info, preserving
+// Debug-level detail for operators who want it without creating log spam
+// at default verbosity.
+func (s *NetworkScanService) collectScanResults(results []domain.NetworkScanResult) ([]domain.DiscoveredCertEntry, []string) {
+	var entries []domain.DiscoveredCertEntry
+	var scanErrors []string
+	for _, result := range results {
+		if result.Error != "" {
+			// Debug-level is intentional: a sweep scan of a /24 typically
+			// produces 200+ connection-refused results, and logging each
+			// at Warn would create log spam at default verbosity. The
+			// aggregate count in the Info-level scan-completed log surfaces
+			// the failure volume to operators; Debug provides the detail
+			// when diagnosing a specific endpoint.
+			if s.logger != nil {
+				s.logger.Debug("scan endpoint error",
+					"address", result.Address,
+					"error", result.Error)
+			}
+			scanErrors = append(scanErrors, fmt.Sprintf("%s: %s", result.Address, result.Error))
+			continue
+		}
+		entries = append(entries, result.Certs...)
+	}
+	return entries, scanErrors
+}
+
 // scanEndpoints probes TLS endpoints concurrently and returns results.
 func (s *NetworkScanService) scanEndpoints(ctx context.Context, endpoints []string, timeout time.Duration) []domain.NetworkScanResult {
 	results := make([]domain.NetworkScanResult, len(endpoints))
@@ -491,3 +491,113 @@ func TestExpandCIDR_SingleLinkLocalIP(t *testing.T) {
 		t.Errorf("expected empty for cloud metadata IP, got %v", ips)
 	}
 }
+
+// TestCollectScanResults_AggregatesErrors is the M-9 regression guard:
+// per-endpoint probe failures must accumulate into the errors slice so the
+// summary Info log and the DiscoveryReport reflect the true failure count.
+// Before the M-9 fix, scanErrors was declared but never appended to, so the
+// aggregate count was always zero and the scan record's Errors field was
+// always nil — silently hiding per-endpoint failures from operators.
+func TestCollectScanResults_AggregatesErrors(t *testing.T) {
+	svc := &NetworkScanService{}
+	results := []domain.NetworkScanResult{
+		{Address: "203.0.113.1:443", Error: "connection refused"},
+		{Address: "203.0.113.2:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "example.com"},
+		}},
+		{Address: "203.0.113.3:443", Error: "tls handshake failure"},
+		{Address: "203.0.113.4:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "internal.example.com"},
+		}},
+		{Address: "203.0.113.5:443", Error: "i/o timeout"},
+	}
+
+	entries, errs := svc.collectScanResults(results)
+
+	if len(entries) != 2 {
+		t.Errorf("expected 2 entries (one per successful probe), got %d", len(entries))
+	}
+	if len(errs) != 3 {
+		t.Fatalf("expected 3 error strings (one per failed probe), got %d: %v", len(errs), errs)
+	}
+
+	// Each error string must be non-empty and include the endpoint address so
+	// the scan record lets operators correlate failures back to endpoints
+	// without needing Debug logging enabled.
+	for i, e := range errs {
+		if e == "" {
+			t.Errorf("error[%d]: expected non-empty error string", i)
+		}
+	}
+
+	// Spot-check that address is threaded through the error strings.
+	if want := "203.0.113.1:443"; errs[0] == "" || errs[0][:len(want)] != want {
+		t.Errorf("errs[0] should start with %q, got %q", want, errs[0])
+	}
+	if want := "203.0.113.3:443"; errs[1] == "" || errs[1][:len(want)] != want {
+		t.Errorf("errs[1] should start with %q, got %q", want, errs[1])
+	}
+	if want := "203.0.113.5:443"; errs[2] == "" || errs[2][:len(want)] != want {
+		t.Errorf("errs[2] should start with %q, got %q", want, errs[2])
+	}
+}
+
+// TestCollectScanResults_AllSuccess exercises the happy path: a scan where
+// every endpoint returned certificates. The errors slice must be nil (not an
+// empty non-nil slice) so the downstream DiscoveryReport.Errors field stays
+// nil as well, preserving the JSON-omitempty behavior that callers rely on.
+func TestCollectScanResults_AllSuccess(t *testing.T) {
+	svc := &NetworkScanService{}
+	results := []domain.NetworkScanResult{
+		{Address: "203.0.113.10:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "a.example.com"},
+		}},
+		{Address: "203.0.113.11:443", Certs: []domain.DiscoveredCertEntry{
+			{CommonName: "b.example.com"},
+		}},
+	}
+
+	entries, errs := svc.collectScanResults(results)
+
+	if len(entries) != 2 {
+		t.Errorf("expected 2 entries, got %d", len(entries))
+	}
+	if errs != nil {
+		t.Errorf("expected nil errors slice on all-success, got %v", errs)
+	}
+}
+
+// TestCollectScanResults_AllFailed exercises the worst-case sweep: every
+// endpoint failed to probe. Entries must be nil, and every failure must be
+// recorded in the errors slice so the scan record is complete.
+func TestCollectScanResults_AllFailed(t *testing.T) {
+	svc := &NetworkScanService{}
+	results := []domain.NetworkScanResult{
+		{Address: "203.0.113.20:443", Error: "connection refused"},
+		{Address: "203.0.113.21:443", Error: "connection refused"},
+		{Address: "203.0.113.22:443", Error: "connection refused"},
+	}
+
+	entries, errs := svc.collectScanResults(results)
+
+	if entries != nil {
+		t.Errorf("expected nil entries on all-failed, got %v", entries)
+	}
+	if len(errs) != 3 {
+		t.Errorf("expected 3 error strings, got %d: %v", len(errs), errs)
+	}
+}
+
+// TestCollectScanResults_Empty guards against a degenerate empty-input case
+// (scanEndpoints returns no results, e.g. if ctx was cancelled before the
+// first probe ran). Both return slices must be nil.
+func TestCollectScanResults_Empty(t *testing.T) {
+	svc := &NetworkScanService{}
+	entries, errs := svc.collectScanResults(nil)
+	if entries != nil {
+		t.Errorf("expected nil entries for empty input, got %v", entries)
+	}
+	if errs != nil {
+		t.Errorf("expected nil errors for empty input, got %v", errs)
+	}
+}