From 4f90be931119d9e3766d441b43b835faef6e9817 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Tue, 24 Mar 2026 23:37:47 -0400 Subject: [PATCH] feat: add network certificate discovery (M21) and Prometheus metrics (M22) M21 adds server-side active TLS scanning of CIDR ranges with concurrent probing, sentinel agent pattern for pipeline reuse, and full CRUD API for scan targets. M22 adds Prometheus exposition format endpoint alongside existing JSON metrics. Comprehensive documentation audit updates all docs to reflect 91 endpoints, 19 tables, 6 scheduler loops, and 900+ tests. Co-Authored-By: Claude Opus 4.6 --- cmd/server/main.go | 25 + docs/architecture.md | 71 +-- docs/compliance-soc2.md | 15 +- docs/concepts.md | 4 +- docs/connectors.md | 78 ++++ docs/demo-advanced.md | 58 ++- docs/demo-guide.md | 10 +- docs/features.md | 76 ++- docs/quickstart.md | 34 +- internal/api/handler/metrics.go | 95 +++- internal/api/handler/network_scan.go | 179 +++++++ .../api/handler/network_scan_handler_test.go | 220 +++++++++ internal/api/handler/stats_handler_test.go | 114 +++++ internal/api/router/router.go | 10 + internal/config/config.go | 31 +- internal/domain/network_scan.go | 27 ++ internal/domain/network_scan_test.go | 67 +++ internal/integration/lifecycle_test.go | 29 ++ internal/integration/negative_test.go | 4 + internal/repository/interfaces.go | 18 + internal/repository/postgres/network_scan.go | 181 ++++++++ internal/scheduler/scheduler.go | 45 ++ internal/service/network_scan.go | 436 ++++++++++++++++++ internal/service/network_scan_test.go | 244 ++++++++++ migrations/000007_network_discovery.down.sql | 1 + migrations/000007_network_discovery.up.sql | 21 + 26 files changed, 2022 insertions(+), 71 deletions(-) create mode 100644 internal/api/handler/network_scan.go create mode 100644 internal/api/handler/network_scan_handler_test.go create mode 100644 internal/domain/network_scan.go create mode 100644 internal/domain/network_scan_test.go create mode 100644 
internal/repository/postgres/network_scan.go create mode 100644 internal/service/network_scan.go create mode 100644 internal/service/network_scan_test.go create mode 100644 migrations/000007_network_discovery.down.sql create mode 100644 migrations/000007_network_discovery.up.sql diff --git a/cmd/server/main.go b/cmd/server/main.go index d0a716e..a269624 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -207,6 +207,24 @@ func main() { agentGroupService := service.NewAgentGroupService(agentGroupRepo, auditService) discoveryRepo := postgres.NewDiscoveryRepository(db) discoveryService := service.NewDiscoveryService(discoveryRepo, certificateRepo, auditService) + networkScanRepo := postgres.NewNetworkScanRepository(db) + networkScanService := service.NewNetworkScanService(networkScanRepo, discoveryService, auditService, logger) + logger.Info("initialized network scan service") + + // Ensure the sentinel "server-scanner" agent exists for network discovery dedup. + // This agent ID is used as the agent_id in discovered_certificates for network-scanned certs. 
+ if cfg.NetworkScan.Enabled { + sentinelAgent := &domain.Agent{ + ID: service.SentinelAgentID, + Name: "Network Scanner (Server-Side)", + Status: domain.AgentStatusOnline, + } + if err := agentRepo.Create(context.Background(), sentinelAgent); err != nil { + // Ignore duplicate key errors (agent already exists) + logger.Debug("sentinel agent creation", "status", "exists or created", "id", service.SentinelAgentID) + } + } + logger.Info("initialized all services") // Initialize stats and metrics services @@ -230,6 +248,7 @@ func main() { metricsHandler := handler.NewMetricsHandler(statsService, time.Now()) healthHandler := handler.NewHealthHandler(cfg.Auth.Type) discoveryHandler := handler.NewDiscoveryHandler(discoveryService) + networkScanHandler := handler.NewNetworkScanHandler(networkScanService) logger.Info("initialized all handlers") // Create context with cancellation @@ -242,6 +261,7 @@ func main() { jobService, agentService, notificationService, + networkScanService, logger, ) @@ -250,6 +270,10 @@ func main() { sched.SetJobProcessorInterval(cfg.Scheduler.JobProcessorInterval) sched.SetAgentHealthCheckInterval(cfg.Scheduler.AgentHealthCheckInterval) sched.SetNotificationProcessInterval(cfg.Scheduler.NotificationProcessInterval) + if cfg.NetworkScan.Enabled { + sched.SetNetworkScanInterval(cfg.NetworkScan.ScanInterval) + logger.Info("network scanning enabled", "interval", cfg.NetworkScan.ScanInterval.String()) + } // Start scheduler logger.Info("starting scheduler") @@ -276,6 +300,7 @@ func main() { metricsHandler, healthHandler, discoveryHandler, + networkScanHandler, ) logger.Info("registered all API handlers") diff --git a/docs/architecture.md b/docs/architecture.md index 99444db..e1ebe45 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -25,12 +25,12 @@ flowchart TB API["REST API\n(Go net/http, :8443)"] SVC["Service Layer"] REPO["Repository Layer\n(database/sql + lib/pq)"] - SCHED["Background Scheduler\n5 loops"] + SCHED["Background 
Scheduler\n6 loops"] DASH["Web Dashboard\n(React SPA)"] end subgraph "Data Store" - PG[("PostgreSQL 16\n18 tables\nTEXT primary keys")] + PG[("PostgreSQL 16\n19 tables\nTEXT primary keys")] end subgraph "Agent Fleet" @@ -374,7 +374,7 @@ Short-lived certificates (those with profile TTL < 1 hour) return "good" from OC ### 4. Automatic Renewal -The control plane runs a scheduler with five background loops: +The control plane runs a scheduler with six background loops: ```mermaid flowchart LR @@ -384,6 +384,7 @@ flowchart LR H["Agent Health\n⏱ every 2m"] N["Notification Processor\n⏱ every 1m"] SL["Short-Lived Expiry\n⏱ every 30s"] + NS["Network Scanner\n⏱ every 6h"] end R -->|"Find expiring certs\nCreate renewal jobs"| DB[("PostgreSQL")] @@ -391,6 +392,7 @@ flowchart LR H -->|"Check heartbeat staleness\nMark agents offline"| DB N -->|"Send pending notifications\nEmail / Webhook / Slack"| DB SL -->|"Expire short-lived certs\nMark as Expired"| DB + NS -->|"Probe TLS endpoints\nStore discovered certs"| DB ``` | Loop | Interval | Timeout | Purpose | @@ -400,6 +402,7 @@ flowchart LR | Agent health check | 2 minutes | 1 minute | Marks agents as offline if heartbeat is stale | | Notification processor | 1 minute | 1 minute | Sends pending notifications via configured channels | | Short-lived expiry | 30 seconds | 30 seconds | Marks expired short-lived certificates (profile TTL < 1 hour) | +| Network scanner | 6 hours | 30 minutes | Probes TLS endpoints on configured CIDR ranges, stores discovered certs (M21, opt-in via `CERTCTL_NETWORK_SCAN_ENABLED`) | Each operation has a context timeout to prevent indefinite hangs if external services become unresponsive. @@ -605,7 +608,7 @@ All endpoints are under `/api/v1/` and follow consistent patterns: Resources: certificates, issuers, targets, agents, jobs, policies, profiles, teams, owners, agent-groups, audit, notifications. 
-The full API is documented in an OpenAPI 3.1 specification at `api/openapi.yaml` with 78 documented operations (including health, readiness, and auth endpoints; 7 discovery endpoints from M18b pending spec update), all request/response schemas, and pagination conventions. See the [OpenAPI Guide](openapi.md) for usage with Swagger UI and SDK generation. +The full API is documented in an OpenAPI 3.1 specification at `api/openapi.yaml` with 91 endpoints across 19 resource domains (including health, readiness, auth, 7 discovery endpoints from M18b, 6 network scan endpoints from M21, and Prometheus metrics from M22), all request/response schemas, and pagination conventions. See the [OpenAPI Guide](openapi.md) for usage with Swagger UI and SDK generation. Jobs support additional action endpoints: `POST /api/v1/jobs/{id}/cancel`, `POST /api/v1/jobs/{id}/approve`, `POST /api/v1/jobs/{id}/reject`. @@ -703,54 +706,64 @@ flowchart TB For production, you would also add an ingress controller, TLS termination for the certctl API itself, and external PostgreSQL (RDS, Cloud SQL, etc.). -## Discovery Data Flow (M18b) +## Discovery Data Flow (M18b + M21) -Certificate discovery enables operators to build a complete inventory of existing certificates before managing them with certctl. Here's how data flows through the system: +Certificate discovery enables operators to build a complete inventory of existing certificates before managing them with certctl. 
There are two discovery modes that feed into the same pipeline: ```mermaid flowchart TB - AGENT["certctl-agent\n(on infrastructure)"] - SCAN["Filesystem Scanner\n(CERTCTL_DISCOVERY_DIRS)"] + subgraph "Discovery Sources" + AGENT["certctl-agent\n(filesystem discovery)"] + SCAN["Filesystem Scanner\n(CERTCTL_DISCOVERY_DIRS)"] + SERVER["certctl-server\n(network discovery)"] + NETSCAN["TLS Scanner\n(CIDR ranges + ports)"] + end + EXTRACT["Extract Metadata\n(CN, SANs, serial, issuer, expiry, fingerprint)"] - REPORT["POST /api/v1/agents/{id}/discoveries\n(submit scan results)"] - HANDLER["Discovery Handler\n(parse request)"] SERVICE["Discovery Service\n(ProcessDiscoveryReport)"] REPO["Discovery Repository\n(upsert with fingerprint dedup)"] DB["PostgreSQL\ndiscovered_certificates\ndiscovery_scans tables"] AUDIT["Audit Service\n(RecordDiscoveryScanCompleted)"] API_LIST["GET /api/v1/discovered-certificates\n(list for triage)"] - API_CLAIM["POST /discovered-certificates/{id}/claim\n(operator claims cert)"] - API_DISMISS["POST /discovered-certificates/{id}/dismiss\n(operator dismisses)"] - UPDATE_STATUS["Update Status\n(Unmanaged → Managed/Dismissed)"] + API_CLAIM["POST /discovered-certificates/{id}/claim"] + API_DISMISS["POST /discovered-certificates/{id}/dismiss"] AGENT -->|"Scan loop\n(startup + 6h)"| SCAN SCAN --> EXTRACT - EXTRACT --> REPORT - REPORT --> HANDLER - HANDLER --> SERVICE + SERVER -->|"Scheduler loop\n(every 6h)"| NETSCAN + NETSCAN -->|"crypto/tls.Dial\n50 goroutines"| EXTRACT + EXTRACT --> SERVICE SERVICE --> REPO - REPO -->|"Dedup by fingerprint\n+ agent + path"| DB + REPO -->|"Dedup by fingerprint\n+ agent_id + source_path"| DB SERVICE --> AUDIT - AUDIT -->|"discovery_scan_completed"| DB - DB -->|"query unmanaged"| API_LIST - API_LIST -->|"operator reviews"| API_CLAIM - API_LIST -->|"operator reviews"| API_DISMISS - API_CLAIM --> UPDATE_STATUS - API_DISMISS --> UPDATE_STATUS - UPDATE_STATUS -->|"RecordDiscoveryCertClaimed\nRecordDiscoveryCertDismissed"| 
AUDIT AUDIT --> DB + DB --> API_LIST + API_LIST --> API_CLAIM + API_LIST --> API_DISMISS ``` -**Key steps:** +**Filesystem Discovery (M18b):** 1. **Agent-side discovery** — Agent scans `CERTCTL_DISCOVERY_DIRS` on startup and every 6 hours, walking directories recursively and parsing PEM/DER files 2. **Metadata extraction** — For each certificate found, extract: common name, SANs, serial number, issuer DN, subject DN, expiration date, key algorithm, key size, is_ca flag, SHA-256 fingerprint (used as dedup key) 3. **Server submission** — Agent POSTs scan results as `DiscoveryReport` to `POST /api/v1/agents/{id}/discoveries` 4. **Deduplication** — Server uses fingerprint + agent ID + filesystem path as unique key; prevents duplicate records of the same cert on the same agent -5. **Storage** — Records stored in `discovered_certificates` table with status = "Unmanaged" -6. **Audit** — `discovery_scan_completed` event logged with agent ID, cert count, scan timestamp -7. **Operator triage** — Operator queries `GET /api/v1/discovered-certificates?status=Unmanaged` to see new findings -8. **Claim or dismiss** — For each unmanaged cert, operator either: + +**Network Discovery (M21):** + +1. **Target configuration** — Operator creates network scan targets via `POST /api/v1/network-scan-targets` with CIDR ranges, ports, and scan interval +2. **CIDR expansion** — Ranges expanded to individual IPs with /20 safety cap (4096 IPs max) +3. **TLS probing** — Server uses `crypto/tls.DialWithDialer` with `InsecureSkipVerify=true` to connect to each endpoint; 50 concurrent goroutines with configurable timeout +4. **Certificate extraction** — Full X.509 metadata extracted from TLS handshake peer certificates +5. **Sentinel agent** — Results submitted using `server-scanner` as virtual agent ID, with `source_path` set to `ip:port` and `source_format` set to `network` +6. 
**Same pipeline** — Feeds into the same `DiscoveryService.ProcessDiscoveryReport()` as filesystem discovery — same dedup, same audit trail, same triage workflow + +**Common triage workflow (both sources):** + +1. **Storage** — Records stored in `discovered_certificates` table with status = "Unmanaged" +2. **Audit** — `discovery_scan_completed` event logged with agent ID, cert count, scan timestamp +3. **Operator triage** — Operator queries `GET /api/v1/discovered-certificates?status=Unmanaged` to see new findings +4. **Claim or dismiss** — For each unmanaged cert, operator either: - **Claims it** via `POST /discovered-certificates/{id}/claim` — links to existing managed cert or creates new enrollment - **Dismisses it** via `POST /discovered-certificates/{id}/dismiss` — removes from triage, marked as "Dismissed" 9. **Status tracking** — `discovery_cert_claimed` and `discovery_cert_dismissed` events audit the operator's decision diff --git a/docs/compliance-soc2.md b/docs/compliance-soc2.md index bdbaf42..bd91179 100644 --- a/docs/compliance-soc2.md +++ b/docs/compliance-soc2.md @@ -160,17 +160,20 @@ Each section includes: - **Health Endpoint** — `GET /health` returns 200 OK with service status. Consumed by Docker health checks and Kubernetes probes. - **Readiness Endpoint** — `GET /ready` returns 200 OK when the database is connected and migrations are applied. 
-- **Background Scheduler Monitoring** — 5 background loops run on a fixed schedule: +- **Background Scheduler Monitoring** — 6 background loops run on a fixed schedule: - Renewal loop: every 1 hour, scans for certificates approaching renewal threshold - Job processor loop: every 30 seconds, picks up pending/waiting jobs and advances their state - Health check loop: every 2 minutes, pings agents to detect downtime - Notification dispatcher loop: every 1 minute, sends queued alerts - Short-lived cert expiry loop: every 30 seconds, marks expired short-lived credentials + - Network scanner loop: every 6 hours, scans enabled TLS endpoints for certificate discovery Each loop includes error handling and logs failures via structured slog. -- **JSON Metrics Endpoint** — `GET /api/v1/metrics` returns JSON object with: - - **Gauges** — `certificates_total`, `certificates_active`, `certificates_expiring_soon`, `agents_total`, `agents_healthy`, `pending_jobs`, `failed_jobs` - - **Counters** — `certs_issued_total`, `certs_renewed_total`, `certs_revoked_total`, `deployments_completed_total`, `deployments_failed_total` - - **Uptime** — `uptime_seconds` (seconds since server start) +- **Metrics Endpoints** — Two formats for monitoring integration: + - `GET /api/v1/metrics` — JSON object with gauges, counters, and uptime for custom dashboards + - `GET /api/v1/metrics/prometheus` — Prometheus exposition format (`text/plain; version=0.0.4`) for native scraping by Prometheus, Grafana Agent, Datadog, and other OpenMetrics-compatible collectors + - **Gauges** — `certctl_certificate_total`, `certctl_certificate_active`, `certctl_certificate_expiring`, `certctl_certificate_expired`, `certctl_certificate_revoked`, `certctl_agent_total`, `certctl_agent_active`, `certctl_job_pending` + - **Counters** — `certctl_job_completed_total`, `certctl_job_failed_total` + - **Uptime** — `certctl_uptime_seconds` (seconds since server start) All values are point-in-time snapshots computed from database 
tables. - **Structured Logging** — All scheduler operations, API calls, and connector actions log via `slog` (Go's structured logger). Logs include timestamp, level (DEBUG/INFO/WARN/ERROR), structured fields (e.g., `actor`, `resource_id`, `latency_ms`), and request IDs for tracing. - **Request ID Propagation** — Each HTTP request gets a unique ID (`X-Request-ID` header). The ID is included in all correlated logs, making it easy to trace a single request through multiple service layers. @@ -426,7 +429,7 @@ Each section includes: | | Metrics JSON Endpoint | `GET /api/v1/metrics` (gauges, counters, uptime) | ✅ | ✅ | Set thresholds, configure alerting | | | Stats API (time-series) | `GET /api/v1/stats/*` (summary, status, expiration, jobs, issuance) | ✅ | ✅ | Integrate into dashboards, SLO tracking | | | Structured Logging | `slog` middleware with request IDs | ✅ | ✅ | Aggregate logs to SIEM, define retention policy | -| | Background Scheduler | 5 loops (renewal 1h, jobs 30s, health 2m, notifications 1m, short-lived 30s) | ✅ | ✅ | Alert on scheduler loop failures | +| | Background Scheduler | 6 loops (renewal 1h, jobs 30s, health 2m, notifications 1m, short-lived 30s, network scan 6h) | ✅ | ✅ | Alert on scheduler loop failures | | **CC7.2** Anomaly Detection | Immutable API Audit Trail | `internal/api/middleware/audit.go`, `GET /api/v1/audit` | ✅ | Enhanced (SIEM export) | Integrate into SIEM, search for anomalies, archive long-term | | | Expiration Threshold Alerting | Configurable per-policy (default 30/14/7/0 days) | ✅ | ✅ | Configure thresholds, integrate notifications | | | Status Auto-Transitions | Active → Expiring (30d) → Expired (0d) | ✅ | ✅ | Monitor status changes in audit trail | diff --git a/docs/concepts.md b/docs/concepts.md index d4f7474..39d369b 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -194,7 +194,7 @@ The MCP server is a separate binary (`cmd/mcp-server/`) that communicates via st Certificate discovery is the process of automatically 
finding existing certificates in your infrastructure — certificates you didn't issue through certctl, possibly issued by other CAs or tools. This is essential for building a complete inventory before you can manage everything. -**How it works:** Agents can scan configured directories (configured via `CERTCTL_DISCOVERY_DIRS`) for certificate files. On startup and every 6 hours, the agent walks these directories recursively, parses PEM and DER files, extracts metadata (common name, SANs, expiration, issuer, key algorithm), and reports all findings to the control plane. The server deduplicates by fingerprint (prevents duplicate reports of the same cert) and stores them with a status: **Unmanaged** (discovered but not yet managed), **Managed** (linked to a control plane cert), or **Dismissed** (operator decided not to manage it). +**How it works:** There are two discovery modes. *Filesystem discovery* — agents scan directories (configured via `CERTCTL_DISCOVERY_DIRS`) for certificate files. On startup and every 6 hours, the agent walks directories recursively, parses PEM and DER files, extracts metadata, and reports findings to the control plane. *Network discovery* — the control plane itself probes TLS endpoints across configured CIDR ranges and ports (enabled via `CERTCTL_NETWORK_SCAN_ENABLED=true`). It connects to each endpoint, extracts certificates from the TLS handshake, and feeds results into the same discovery pipeline. This finds certificates on services you may not have agents on. In both cases, the server deduplicates by fingerprint and stores discovered certs with a status: **Unmanaged** (discovered but not yet managed), **Managed** (linked to a control plane cert), or **Dismissed** (operator decided not to manage it). This gives you a three-step triage workflow: 1. 
**Discover** — Agents find all existing certs on your infrastructure @@ -205,7 +205,7 @@ This is a prerequisite for multi-CA migration, compliance audits, and building c ### Observability -certctl exposes a JSON metrics endpoint at `GET /api/v1/metrics` with gauges (certificate totals by status, agent counts, pending jobs), counters (completed/failed jobs), and uptime. Five stats endpoints power the dashboard charts: summary statistics, certificates by status, expiration timeline, job trends, and issuance rate. +certctl exposes metrics in two formats: a JSON endpoint at `GET /api/v1/metrics` and a Prometheus exposition format endpoint at `GET /api/v1/metrics/prometheus` (compatible with Prometheus, Grafana Agent, Datadog Agent, and VictoriaMetrics). Both provide gauges (certificate totals by status, agent counts, pending jobs), counters (completed/failed jobs), and uptime. Five stats endpoints power the dashboard charts: summary statistics, certificates by status, expiration timeline, job trends, and issuance rate. The agent fleet overview page groups agents by OS, architecture, and version, showing distribution charts that help ops teams track fleet health and identify outdated agents. All API requests are logged via structured `slog` middleware with request IDs for correlation. diff --git a/docs/connectors.md b/docs/connectors.md index 8836539..8dcd4f7 100644 --- a/docs/connectors.md +++ b/docs/connectors.md @@ -639,6 +639,84 @@ curl -s http://localhost:8443/api/v1/discovery-summary | jq . - **Compliance** — Detect rogue/unauthorized certificates in monitored directories - **Integration** — Pull certificate data from systems that pre-generate certs (e.g., Kubernetes CertManager) +## Network Certificate Scanner (M21) + +The control plane includes a built-in active TLS scanner that probes network endpoints and discovers certificates without requiring agent deployment. This complements the agent-based filesystem discovery with network-level visibility. 
+ +### Configuration + +Enable network scanning on the server: + +```bash +export CERTCTL_NETWORK_SCAN_ENABLED=true +export CERTCTL_NETWORK_SCAN_INTERVAL=6h # default +``` + +### Creating Scan Targets + +Network scan targets define which CIDR ranges and ports to probe: + +```bash +# Create a scan target for your internal network +curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Production Web Servers", + "cidrs": ["10.0.1.0/24", "10.0.2.0/24"], + "ports": [443, 8443, 6443], + "enabled": true, + "scan_interval_hours": 6, + "timeout_ms": 5000 + }' | jq . +``` + +### How It Works + +1. **Expand**: CIDR ranges are expanded to individual IPs (safety cap at /20 = 4096 IPs) +2. **Probe**: Concurrent TLS connections (50 goroutines) with configurable timeout per endpoint +3. **Extract**: Certificate metadata extracted from TLS handshake (CN, SANs, serial, issuer, key info, fingerprint) +4. **Pipeline**: Results fed into the same `DiscoveryService.ProcessDiscoveryReport()` as filesystem discovery +5. **Deduplicate**: Sentinel agent ID (`server-scanner`) with source_path as `ip:port` ensures proper dedup +6. **Triage**: Discovered certs appear in `GET /api/v1/discovered-certificates` with `agent_id=server-scanner` + +### API Endpoints + +```bash +# List all scan targets +curl -s http://localhost:8443/api/v1/network-scan-targets | jq . + +# Create a scan target +curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \ + -H "Content-Type: application/json" \ + -d '{"name": "DMZ", "cidrs": ["172.16.0.0/24"], "ports": [443]}' | jq . + +# Get a specific target (includes last_scan_at, last_scan_certs_found) +curl -s http://localhost:8443/api/v1/network-scan-targets/nst-dmz | jq . + +# Trigger an immediate scan (doesn't wait for scheduler) +curl -s -X POST http://localhost:8443/api/v1/network-scan-targets/nst-dmz/scan | jq . 
+ +# Update scan configuration +curl -s -X PUT http://localhost:8443/api/v1/network-scan-targets/nst-dmz \ + -H "Content-Type: application/json" \ + -d '{"ports": [443, 8443, 9443], "timeout_ms": 3000}' | jq . + +# Delete a scan target +curl -s -X DELETE http://localhost:8443/api/v1/network-scan-targets/nst-dmz +``` + +### Scheduler Integration + +When `CERTCTL_NETWORK_SCAN_ENABLED=true`, the server runs a 6th scheduler loop (alongside renewal, jobs, health, notifications, and short-lived expiry). It scans all enabled targets at the configured interval (default 6h). Each target tracks `last_scan_at`, `last_scan_duration_ms`, and `last_scan_certs_found` for monitoring scan health. + +### Use Cases + +- **Network inventory** — "What TLS certs are deployed across my network?" without deploying agents +- **Shadow certificate detection** — Find certificates on services you didn't know were running TLS +- **Compliance scanning** — Prove to auditors that all TLS endpoints are inventoried +- **Migration assessment** — Scan a network range before onboarding to certctl management +- **Expiration monitoring** — Discover soon-to-expire certs on network endpoints before they cause outages + ## What's Next - [Architecture Guide](architecture.md) — Understanding the full system design diff --git a/docs/demo-advanced.md b/docs/demo-advanced.md index eda771f..0aa56b4 100644 --- a/docs/demo-advanced.md +++ b/docs/demo-advanced.md @@ -695,11 +695,14 @@ curl -s "$API/api/v1/stats/job-trends?days=30" | jq . # Issuance rate — new certificates per day over 30 days curl -s "$API/api/v1/stats/issuance-rate?days=30" | jq . -# System metrics — gauges, counters, uptime +# System metrics — gauges, counters, uptime (JSON) curl -s $API/api/v1/metrics | jq . 
+ +# System metrics — Prometheus exposition format (for Prometheus/Grafana/Datadog scraping) +curl -s $API/api/v1/metrics/prometheus ``` -**How it works:** The `StatsService` computes aggregations in Go from existing repository List methods — no additional SQL queries or materialized views. This keeps the database schema simple while providing real-time dashboard data. The metrics endpoint returns gauges (cert totals by status, agent counts, pending jobs), counters (completed/failed jobs), and server uptime. +**How it works:** The `StatsService` computes aggregations in Go from existing repository List methods — no additional SQL queries or materialized views. This keeps the database schema simple while providing real-time dashboard data. The JSON metrics endpoint returns gauges (cert totals by status, agent counts, pending jobs), counters (completed/failed jobs), and server uptime. The Prometheus endpoint (`/api/v1/metrics/prometheus`) exposes the same data in Prometheus exposition format (`text/plain; version=0.0.4`) with `certctl_`-prefixed metric names — ready for scraping by Prometheus, Grafana Agent, Datadog Agent, or VictoriaMetrics. **In the dashboard**, these stats power four interactive charts: an expiration heatmap, renewal success rate trends, certificate status distribution, and issuance rate. The agent fleet overview page uses agent metadata to group by OS, architecture, and version. @@ -916,11 +919,13 @@ The MCP server is perfect for: --- -## Part 16: Certificate Discovery (M18b) +## Part 16: Certificate Discovery (M18b + M21) -Agents can automatically discover existing certificates already deployed in your infrastructure. This is useful for building a baseline inventory before you start managing everything with certctl. +certctl discovers existing certificates two ways: **filesystem scanning** (agents scan local directories) and **network scanning** (the server probes TLS endpoints). Both feed into the same triage pipeline. 
-First, configure the demo agent to scan for certificates. In the Docker Compose setup, agents have a `/tmp/certs` directory (created by the seed script). Restart the agent with discovery enabled: +### Filesystem Discovery (Agent-Side) + +Configure the demo agent to scan for certificates. In the Docker Compose setup, agents have a `/tmp/certs` directory (created by the seed script). Restart the agent with discovery enabled: ```bash # Stop the existing agent @@ -936,17 +941,46 @@ Or with the CLI flag: certctl-agent --agent-id a-demo-1 --key-dir /tmp/keys --discovery-dirs /tmp/certs --server http://localhost:8443 --api-key test-key-123 ``` -Now check what the agent discovered: +### Network Discovery (Server-Side) + +The server can also discover certificates by actively probing TLS endpoints — no agent required. Create a scan target and trigger a scan: ```bash -# List discovered certificates (should show unmanaged certs found on the agent) +# Create a network scan target +curl -s -X POST $API/api/v1/network-scan-targets \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Demo Local Scan", + "cidrs": ["127.0.0.1/32"], + "ports": [8443], + "enabled": true, + "scan_interval_hours": 6, + "timeout_ms": 5000 + }' | jq . + +# Trigger an immediate scan (otherwise runs every 6 hours) +NST_ID=$(curl -s $API/api/v1/network-scan-targets | jq -r '.data[0].id') +curl -s -X POST "$API/api/v1/network-scan-targets/$NST_ID/scan" | jq . + +# List scan targets and their results +curl -s $API/api/v1/network-scan-targets | jq . +``` + +Network-discovered certificates appear in the same discovery pipeline as filesystem-discovered ones, with `agent_id=server-scanner` and `source_format=network`. + +### Triage Discovered Certificates + +Both discovery sources feed into the same triage workflow. 
Check what was found: + +```bash +# List discovered certificates (should show unmanaged certs found by agents and network scans) curl -s "$API/api/v1/discovered-certificates?status=Unmanaged" | jq '.data[] | {id, common_name, expires_at, issuer_dn, status}' # Get a summary of all discoveries curl -s $API/api/v1/discovery-summary | jq . ``` -If the agent found certificates, you'll see entries with `status: "Unmanaged"`. Now triage them — claim the ones you want to manage or dismiss the ones you don't: +If certificates were found, you'll see entries with `status: "Unmanaged"`. Triage them — claim the ones you want to manage or dismiss the ones you don't: ```bash # Claim a certificate (link it to a managed cert, or create new enrollment) @@ -961,9 +995,9 @@ curl -s -X POST "$API/api/v1/discovered-certificates/$DISCOVERED_ID/dismiss" \ -d '{"reason": "Self-signed test cert, not production"}' | jq . ``` -**How it works:** The agent scans `CERTCTL_DISCOVERY_DIRS` on startup and every 6 hours, extracts metadata (common name, SANs, issuer, expiration, key type, fingerprint) from all PEM and DER files, and POSTs the findings to `POST /api/v1/agents/{id}/discoveries`. The server deduplicates by fingerprint (prevents duplicate records) and stores results with a status: **Unmanaged** (discovered, not yet managed), **Managed** (linked to a control plane cert), or **Dismissed** (operator decided not to manage). This gives you a triage workflow: discover → review → claim or dismiss. +**How it works:** Filesystem discovery: the agent scans `CERTCTL_DISCOVERY_DIRS` on startup and every 6 hours, extracts metadata (common name, SANs, issuer, expiration, key type, fingerprint) from all PEM and DER files, and POSTs findings to `POST /api/v1/agents/{id}/discoveries`. Network discovery: the server expands CIDR ranges (capped at /20 = 4096 IPs), connects to each IP:port via TLS, extracts the peer certificate chain, and stores results using `server-scanner` as a sentinel agent ID. 
Both sources deduplicate by fingerprint and store results with a status: **Unmanaged** (discovered, not yet managed), **Managed** (linked to a control plane cert), or **Dismissed** (operator decided not to manage). This gives you a triage workflow: discover → review → claim or dismiss. -**In the dashboard**, the Discovery page (coming in future V2.x) will provide a visual triage interface for claiming and dismissing discovered certificates. +**In the dashboard**, click "Discovered Certificates" in the sidebar to see what agents and network scans found — claim unmanaged certs to bring them under certctl's management, or dismiss them. --- @@ -989,12 +1023,12 @@ flowchart TB API["REST API\nGo net/http"] SVC["Service Layer\nBusiness Logic"] REPO["Repository Layer\ndatabase/sql + lib/pq"] - SCHED["Scheduler\n5 background loops"] + SCHED["Scheduler\n6 background loops"] CONN["Connector Registry\nIssuer + Target + Notifier"] end subgraph "Data Store" - PG["PostgreSQL 16\n18 tables, TEXT PKs"] + PG["PostgreSQL 16\n19 tables, TEXT PKs"] end subgraph "Agent (certctl-agent)" diff --git a/docs/demo-guide.md b/docs/demo-guide.md index b02ad4d..9d036b9 100644 --- a/docs/demo-guide.md +++ b/docs/demo-guide.md @@ -70,11 +70,11 @@ On the Certificates page, select multiple certificates using the checkboxes. A b Click any certificate, then scroll to the deployment timeline. A visual 4-step timeline shows the lifecycle: Requested → Issued → Deploying → Active. Previous versions show a rollback button. **11. "What about certificates already running in production?"** -Enable discovery on agents by setting `CERTCTL_DISCOVERY_DIRS` to directories containing certificates (e.g., `/etc/nginx/certs`). Agents scan on startup and every 6 hours, report findings to the control plane. Click "Discovered Certificates" to see what agents found — claim unmanaged certs to bring them under certctl's management, or dismiss them. 
+Enable discovery on agents by setting `CERTCTL_DISCOVERY_DIRS` to directories containing certificates (e.g., `/etc/nginx/certs`). Agents scan on startup and every 6 hours, report findings to the control plane. For network-based discovery without agents, enable `CERTCTL_NETWORK_SCAN_ENABLED=true` and configure scan targets via the API — the server probes TLS endpoints on configured CIDR ranges and ports. Click "Discovered Certificates" to see what agents and network scans found — claim unmanaged certs to bring them under certctl's management, or dismiss them. ## REST API Walkthrough -The dashboard is backed by a real REST API (84 endpoints). Try these while the demo is running: +The dashboard is backed by a real REST API (91 endpoints). Try these while the demo is running: ```bash # List all certificates @@ -114,6 +114,7 @@ curl -s http://localhost:8443/api/v1/stats/expiration-timeline | jq . curl -s http://localhost:8443/api/v1/stats/job-trends | jq . curl -s http://localhost:8443/api/v1/stats/issuance-rate | jq . curl -s http://localhost:8443/api/v1/metrics | jq . +curl -s http://localhost:8443/api/v1/metrics/prometheus # Prometheus format # Certificate profiles curl -s http://localhost:8443/api/v1/profiles | jq . @@ -135,6 +136,9 @@ curl -s http://localhost:8443/api/v1/discovered-certificates | jq . # Discovery summary (counts by status) curl -s http://localhost:8443/api/v1/discovery-summary | jq . + +# Network scan targets (active TLS scanning) +curl -s http://localhost:8443/api/v1/network-scan-targets | jq . ``` ## CLI Tool @@ -236,7 +240,7 @@ If you're demoing to a team or customer, here's a suggested flow: 7. **Show profiles** — "Certificate profiles enforce crypto constraints — key types, max TTL, compliance requirements" 8. **Show policies** — "Guardrails prevent teams from going outside approved scope" 9. **Show bulk operations** — "Select multiple certs, trigger renewal or revoke in bulk with progress tracking" -10. 
**Show certificate discovery** — "Agents scan your infrastructure for existing certificates you're not managing yet. We automatically deduplicate by fingerprint, show you what we found, and let you claim them or dismiss them" +10. **Show certificate discovery** — "We discover certificates two ways: agents scan local filesystems, and the server actively probes TLS endpoints on your network. We deduplicate by fingerprint, show you what we found, and let you claim them or dismiss them" 11. **Show the immutable audit trail** — "Every action in the system is recorded: who did it, what they did, when, what changed. Export to CSV/JSON for compliance" 12. **Show advanced query features** — "Sort by any field, filter by date range, paginate efficiently with cursor-based pagination, select just the fields you need" 13. **Show the CLI and MCP server** — "Terminal users get `certctl-cli` with 10 subcommands. AI assistants get MCP integration with 76 tools. Everything is API-first" diff --git a/docs/features.md b/docs/features.md index dfeafca..c8eca9f 100644 --- a/docs/features.md +++ b/docs/features.md @@ -7,7 +7,7 @@ Complete reference of all features shipped in the V2 release (as of March 2026). ## API Surface ### Overview -- **84 endpoints** across 17 resource domains under `/api/v1/` +- **91 endpoints** across 19 resource domains under `/api/v1/` - REST API with HTTP semantics (GET, POST, PUT, DELETE) - All endpoints require authentication by default (configurable) - OpenAPI 3.1 spec with full schema documentation @@ -55,10 +55,11 @@ Complete reference of all features shipped in the V2 release (as of March 2026). 
| **Owners** | 5 | List, create, get, update, delete | | **Agent Groups** | 6 | List, create, get, update, delete, list agents in group | | **Discovery** | 7 | Submit scan results, list discovered certs, get detail, claim, dismiss, list scans, summary stats | +| **Network Scan** | 6 | List targets, create, get, update, delete, trigger scan | | **Audit** | 3 | List events, list by resource, export (CSV/JSON) | | **Notifications** | 3 | List, get, mark as read | | **Stats** | 5 | Dashboard summary, certificates by status, expiration timeline, job trends, issuance rate | -| **Metrics** | 1 | JSON metrics (gauges, counters, uptime) | +| **Metrics** | 2 | JSON metrics (gauges, counters, uptime), Prometheus exposition format | | **Health** | 4 | Health check, readiness check, auth info, auth check | --- @@ -411,6 +412,60 @@ Each discovered certificate is parsed and its metadata extracted: --- +## Network Certificate Discovery (M21) + +### Overview +Server-side active TLS scanning probes network endpoints across CIDR ranges, extracts certificate metadata from TLS handshakes, and feeds results into the existing filesystem discovery pipeline. No agent deployment required — the control plane scans directly. + +### Configuration +- **Enable** — `CERTCTL_NETWORK_SCAN_ENABLED=true` (disabled by default) +- **Scan Interval** — `CERTCTL_NETWORK_SCAN_INTERVAL=6h` (default 6 hours, configurable) + +### Network Scan Targets +Scan targets define what CIDR ranges and ports to probe. 
+ +| Field | Details | Example | +|-------|---------|---------| +| **ID** | Prefixed text PK (nst-xxx) | nst-datacenter-east | +| **Name** | Human-readable target name | Datacenter East Production | +| **CIDRs** | Array of CIDR ranges | ["10.0.1.0/24", "10.0.2.0/24"] | +| **Ports** | Array of TCP ports | [443, 8443, 6443] | +| **Enabled** | Toggle scanning on/off | true | +| **Scan Interval Hours** | Per-target scan frequency | 6 | +| **Timeout Ms** | Per-connection timeout | 5000 | + +### Scanning Behavior +- **CIDR Expansion** — Ranges expanded to individual IPs; safety cap at /20 (4096 IPs) prevents accidental large scans +- **Concurrent Probing** — 50 goroutines (semaphore-based), configurable timeout per TLS connection +- **TLS Extraction** — `crypto/tls.DialWithDialer` with `InsecureSkipVerify=true` discovers all certs including self-signed, expired, and internal CA certs +- **Sentinel Agent Pattern** — Uses `server-scanner` as virtual agent ID, reusing the existing `discovered_certificates` dedup constraint without schema changes +- **Discovery Pipeline** — Scan results feed into `DiscoveryService.ProcessDiscoveryReport()` for fingerprint dedup, audit trail, and triage workflow + +### Network Scan API Endpoints (M21) + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/api/v1/network-scan-targets` | GET | List all scan targets with metrics | +| `/api/v1/network-scan-targets` | POST | Create a new scan target | +| `/api/v1/network-scan-targets/{id}` | GET | Get scan target details | +| `/api/v1/network-scan-targets/{id}` | PUT | Update scan target configuration | +| `/api/v1/network-scan-targets/{id}` | DELETE | Delete a scan target | +| `/api/v1/network-scan-targets/{id}/scan` | POST | Trigger an immediate scan | + +### Scheduler Integration +- **6th scheduler loop** — runs at configured interval (default 6h) alongside renewal (1h), jobs (30s), health (2m), notifications (1m), short-lived expiry (30s) +- **Conditional** — only starts if 
`CERTCTL_NETWORK_SCAN_ENABLED=true` and network scan service is initialized +- **Scan Metrics** — each target tracks `last_scan_at`, `last_scan_duration_ms`, `last_scan_certs_found` + +### Use Cases +- **Network Inventory** — "What TLS certs are deployed across my network?" without deploying agents +- **Shadow Certificate Detection** — Find certificates on services you didn't know were running TLS +- **Compliance Scanning** — Prove to auditors that all TLS endpoints are inventoried +- **Migration Assessment** — Scan a network range before onboarding to certctl management +- **Expiration Monitoring** — Discover soon-to-expire certs on network endpoints before they cause outages + +--- + ## Ownership & Accountability ### Teams @@ -451,13 +506,23 @@ Live aggregated views of certificate and job metrics. | **Certificate Status Distribution** | Donut | Pie breakdown: Active, Expiring, Expired, Failed, Revoked, etc. | | **Issuance Rate** | Bar (30-day) | Certs issued per day; trend line | -#### Metrics Endpoint +#### Metrics Endpoints + +**JSON Format** - **URL** — `GET /api/v1/metrics` - **Format** — JSON with timestamp - **Gauges** — Certificate counts by status, agent count (online/offline), pending job count - **Counters** — Total jobs completed, total jobs failed, total renewals, total issuances - **Uptime** — Server uptime in seconds +**Prometheus Exposition Format (M22)** +- **URL** — `GET /api/v1/metrics/prometheus` +- **Content-Type** — `text/plain; version=0.0.4; charset=utf-8` +- **Compatible with** — Prometheus, Grafana Agent, Datadog Agent, Victoria Metrics, OpenMetrics scrapers +- **Naming** — `certctl_` prefix, snake_case (e.g., `certctl_certificate_total`, `certctl_agent_online`) +- **11 Metrics** — 9 gauges (cert total/active/expiring/expired/revoked, agent total/online, job pending, uptime seconds), 2 counters (job completed/failed totals) +- **Scrape Config** — Add to `prometheus.yml`: `scrape_configs: [{job_name: certctl, static_configs: 
[{targets: ['localhost:8443']}], metrics_path: /api/v1/metrics/prometheus}]` + #### Stats API (M14) Five parameterized endpoints for dashboard data. @@ -541,7 +606,7 @@ Every API call recorded to immutable `audit_events` table. 3. **Approve** → `POST /api/v1/jobs/{id}/approve` → Job → `Running` 4. **Reject** → `POST /api/v1/jobs/{id}/reject` + reason → Job → `Cancelled` -### Background Scheduler (5 loops) +### Background Scheduler (6 loops) | Loop | Interval | Task | |------|----------|------| | **Renewal Checker** | 1 hour | Scan policies; trigger renewals if cert expires soon | @@ -549,6 +614,7 @@ Every API call recorded to immutable `audit_events` table. | **Health Checker** | 2 minutes | Check agent heartbeat; mark offline if >3 missed | | **Notification Processor** | 1 minute | Send queued notifications (email, Slack, webhook, etc.) | | **Short-Lived Cleanup** | 30 seconds | Audit short-lived credential expirations | +| **Network Scanner** | 6 hours | Scan enabled network targets; discover TLS certificates | All loops have configurable intervals via environment variables (`CERTCTL_SCHEDULER_*_INTERVAL`). @@ -898,7 +964,7 @@ Each guide includes an evidence summary table mapping specific criteria to certc | Revocation (RFC 5280, CRL, OCSP) | ✓ | ✓ | Shipped | | Dashboard + 19 pages | ✓ | ✓ | Shipped | | Observability (charts, metrics, stats) | ✓ | ✓ | Shipped | -| REST API (84 endpoints) | ✓ | ✓ | Shipped | +| REST API (91 endpoints) | ✓ | ✓ | Shipped | | MCP server (76 tools) | ✓ | ✓ | Shipped v2.1 | | CLI tool (10 subcommands) | ✓ | ✓ | Shipped | | Compliance mapping docs (SOC 2, PCI-DSS, NIST) | ✓ | ✓ | Shipped | diff --git a/docs/quickstart.md b/docs/quickstart.md index 02778b6..9a2dca8 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -295,8 +295,11 @@ curl -s "http://localhost:8443/api/v1/stats/expiration-timeline?days=90" | jq . # Job trends (last 30 days) curl -s "http://localhost:8443/api/v1/stats/job-trends?days=30" | jq . 
-# System metrics +# System metrics (JSON) curl -s http://localhost:8443/api/v1/metrics | jq . + +# System metrics (Prometheus format — for scraping by Prometheus, Grafana Agent, Datadog) +curl -s http://localhost:8443/api/v1/metrics/prometheus ``` ### Certificate profiles @@ -364,6 +367,35 @@ curl -s -X POST "http://localhost:8443/api/v1/discovered-certificates/DISCOVERY_ -d '{"managed_certificate_id": "mc-api-prod"}' | jq . ``` +### Network Certificate Discovery + +The server can also discover certificates by scanning TLS endpoints directly — no agent required: + +```bash +# Enable network scanning (set in environment or docker-compose) +export CERTCTL_NETWORK_SCAN_ENABLED=true + +# Create a scan target (e.g., scan your internal network on port 443) +curl -s -X POST http://localhost:8443/api/v1/network-scan-targets \ + -H "Content-Type: application/json" \ + -d '{ + "name": "Internal Network", + "cidrs": ["10.0.1.0/24"], + "ports": [443, 8443], + "enabled": true, + "scan_interval_hours": 6, + "timeout_ms": 5000 + }' | jq . + +# Trigger an immediate scan +curl -s -X POST http://localhost:8443/api/v1/network-scan-targets/nst-internal-network/scan | jq . + +# List scan targets with results +curl -s http://localhost:8443/api/v1/network-scan-targets | jq . +``` + +Discovered network certificates appear in the same `GET /api/v1/discovered-certificates` list as filesystem-discovered certs, with `agent_id=server-scanner` and `source_format=network`. 
+ ## What's Next - **[Advanced Demo](demo-advanced.md)** — Issue a real certificate via the Local CA and watch it appear in the dashboard diff --git a/internal/api/handler/metrics.go b/internal/api/handler/metrics.go index ff5776c..51f7836 100644 --- a/internal/api/handler/metrics.go +++ b/internal/api/handler/metrics.go @@ -3,6 +3,7 @@ package handler import ( "context" "encoding/json" + "fmt" "net/http" "time" @@ -14,9 +15,9 @@ type MetricsService interface { GetDashboardSummary(ctx context.Context) (interface{}, error) } -// MetricsHandler handles HTTP requests for Prometheus-style metrics. -// In V2, returns JSON metrics (not Prometheus format). -// Prometheus format can be added in V3 when observability becomes a paid feature. +// MetricsHandler handles HTTP requests for metrics. +// Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format +// (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc. type MetricsHandler struct { svc MetricsService serverStarted time.Time @@ -117,6 +118,94 @@ func (h MetricsHandler) GetMetrics(w http.ResponseWriter, r *http.Request) { JSON(w, http.StatusOK, metricsResp) } +// GetPrometheusMetrics returns metrics in Prometheus exposition format (text/plain). +// GET /api/v1/metrics/prometheus +// Compatible with Prometheus, Grafana Agent, Datadog Agent, Victoria Metrics, and any +// OpenMetrics-compatible scraper. Metric names follow Prometheus naming conventions +// (lowercase, snake_case, prefixed with certctl_). 
+func (h MetricsHandler) GetPrometheusMetrics(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + Error(w, http.StatusMethodNotAllowed, "Method not allowed") + return + } + + requestID := middleware.GetRequestID(r.Context()) + + summary, err := h.svc.GetDashboardSummary(r.Context()) + if err != nil { + ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to collect metrics", requestID) + return + } + + // Extract fields from summary via JSON round-trip (avoids cross-package type assertion) + jsonBytes, err := json.Marshal(summary) + if err != nil { + ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to marshal metrics data", requestID) + return + } + var dashboardSummary DashboardSummary + if err := json.Unmarshal(jsonBytes, &dashboardSummary); err != nil { + ErrorWithRequestID(w, http.StatusInternalServerError, "Invalid metrics data", requestID) + return + } + + // Compute derived values + active := dashboardSummary.TotalCertificates - dashboardSummary.ExpiringCertificates - dashboardSummary.ExpiredCertificates - dashboardSummary.RevokedCertificates + uptimeSeconds := int64(time.Since(h.serverStarted).Seconds()) + + // Build Prometheus exposition format + // See: https://prometheus.io/docs/instrumenting/exposition_formats/ + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + w.WriteHeader(http.StatusOK) + + // Gauges — point-in-time values + fmt.Fprintf(w, "# HELP certctl_certificate_total Total number of managed certificates.\n") + fmt.Fprintf(w, "# TYPE certctl_certificate_total gauge\n") + fmt.Fprintf(w, "certctl_certificate_total %d\n\n", dashboardSummary.TotalCertificates) + + fmt.Fprintf(w, "# HELP certctl_certificate_active Number of active (non-expiring, non-expired, non-revoked) certificates.\n") + fmt.Fprintf(w, "# TYPE certctl_certificate_active gauge\n") + fmt.Fprintf(w, "certctl_certificate_active %d\n\n", active) + + fmt.Fprintf(w, "# HELP certctl_certificate_expiring_soon 
Number of certificates expiring within 30 days.\n") + fmt.Fprintf(w, "# TYPE certctl_certificate_expiring_soon gauge\n") + fmt.Fprintf(w, "certctl_certificate_expiring_soon %d\n\n", dashboardSummary.ExpiringCertificates) + + fmt.Fprintf(w, "# HELP certctl_certificate_expired Number of expired certificates.\n") + fmt.Fprintf(w, "# TYPE certctl_certificate_expired gauge\n") + fmt.Fprintf(w, "certctl_certificate_expired %d\n\n", dashboardSummary.ExpiredCertificates) + + fmt.Fprintf(w, "# HELP certctl_certificate_revoked Number of revoked certificates.\n") + fmt.Fprintf(w, "# TYPE certctl_certificate_revoked gauge\n") + fmt.Fprintf(w, "certctl_certificate_revoked %d\n\n", dashboardSummary.RevokedCertificates) + + fmt.Fprintf(w, "# HELP certctl_agent_total Total number of registered agents.\n") + fmt.Fprintf(w, "# TYPE certctl_agent_total gauge\n") + fmt.Fprintf(w, "certctl_agent_total %d\n\n", dashboardSummary.TotalAgents) + + fmt.Fprintf(w, "# HELP certctl_agent_online Number of agents currently online.\n") + fmt.Fprintf(w, "# TYPE certctl_agent_online gauge\n") + fmt.Fprintf(w, "certctl_agent_online %d\n\n", dashboardSummary.ActiveAgents) + + fmt.Fprintf(w, "# HELP certctl_job_pending Number of jobs currently pending.\n") + fmt.Fprintf(w, "# TYPE certctl_job_pending gauge\n") + fmt.Fprintf(w, "certctl_job_pending %d\n\n", dashboardSummary.PendingJobs) + + // Counters — cumulative values + fmt.Fprintf(w, "# HELP certctl_job_completed_total Total number of completed jobs.\n") + fmt.Fprintf(w, "# TYPE certctl_job_completed_total counter\n") + fmt.Fprintf(w, "certctl_job_completed_total %d\n\n", dashboardSummary.CompleteJobs) + + fmt.Fprintf(w, "# HELP certctl_job_failed_total Total number of failed jobs.\n") + fmt.Fprintf(w, "# TYPE certctl_job_failed_total counter\n") + fmt.Fprintf(w, "certctl_job_failed_total %d\n\n", dashboardSummary.FailedJobs) + + // Info — server uptime + fmt.Fprintf(w, "# HELP certctl_uptime_seconds Server uptime in seconds.\n") + fmt.Fprintf(w, 
"# TYPE certctl_uptime_seconds gauge\n") + fmt.Fprintf(w, "certctl_uptime_seconds %d\n", uptimeSeconds) +} + // DashboardSummary mirrors the service.DashboardSummary for JSON unmarshaling. // JSON tags must match the service-layer struct exactly. type DashboardSummary struct { diff --git a/internal/api/handler/network_scan.go b/internal/api/handler/network_scan.go new file mode 100644 index 0000000..a4390d8 --- /dev/null +++ b/internal/api/handler/network_scan.go @@ -0,0 +1,179 @@ +package handler + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + + "github.com/shankar0123/certctl/internal/domain" +) + +// NetworkScanService defines the interface used by the network scan handler. +type NetworkScanService interface { + ListTargets(ctx context.Context) ([]*domain.NetworkScanTarget, error) + GetTarget(ctx context.Context, id string) (*domain.NetworkScanTarget, error) + CreateTarget(ctx context.Context, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) + UpdateTarget(ctx context.Context, id string, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) + DeleteTarget(ctx context.Context, id string) error + TriggerScan(ctx context.Context, targetID string) (*domain.DiscoveryScan, error) +} + +// NetworkScanHandler handles HTTP requests for network scan targets. +type NetworkScanHandler struct { + svc NetworkScanService +} + +// NewNetworkScanHandler creates a new network scan handler. 
+func NewNetworkScanHandler(svc NetworkScanService) NetworkScanHandler { + return NetworkScanHandler{svc: svc} +} + +// ListNetworkScanTargets handles GET /api/v1/network-scan-targets +func (h NetworkScanHandler) ListNetworkScanTargets(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + Error(w, http.StatusMethodNotAllowed, "Method not allowed") + return + } + + targets, err := h.svc.ListTargets(r.Context()) + if err != nil { + Error(w, http.StatusInternalServerError, fmt.Sprintf("failed to list network scan targets: %v", err)) + return + } + + if targets == nil { + targets = []*domain.NetworkScanTarget{} + } + + JSON(w, http.StatusOK, PagedResponse{ + Data: targets, + Total: int64(len(targets)), + Page: 1, + PerPage: len(targets), + }) +} + +// GetNetworkScanTarget handles GET /api/v1/network-scan-targets/{id} +func (h NetworkScanHandler) GetNetworkScanTarget(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + Error(w, http.StatusMethodNotAllowed, "Method not allowed") + return + } + + id := r.PathValue("id") + if id == "" { + Error(w, http.StatusBadRequest, "network scan target ID is required") + return + } + + target, err := h.svc.GetTarget(r.Context(), id) + if err != nil { + Error(w, http.StatusNotFound, fmt.Sprintf("network scan target not found: %v", err)) + return + } + + JSON(w, http.StatusOK, target) +} + +// CreateNetworkScanTarget handles POST /api/v1/network-scan-targets +func (h NetworkScanHandler) CreateNetworkScanTarget(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + Error(w, http.StatusMethodNotAllowed, "Method not allowed") + return + } + + var target domain.NetworkScanTarget + if err := json.NewDecoder(r.Body).Decode(&target); err != nil { + Error(w, http.StatusBadRequest, fmt.Sprintf("invalid request body: %v", err)) + return + } + + created, err := h.svc.CreateTarget(r.Context(), &target) + if err != nil { + Error(w, http.StatusBadRequest, fmt.Sprintf("failed to 
create network scan target: %v", err)) + return + } + + JSON(w, http.StatusCreated, created) +} + +// UpdateNetworkScanTarget handles PUT /api/v1/network-scan-targets/{id} +func (h NetworkScanHandler) UpdateNetworkScanTarget(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPut { + Error(w, http.StatusMethodNotAllowed, "Method not allowed") + return + } + + id := r.PathValue("id") + if id == "" { + Error(w, http.StatusBadRequest, "network scan target ID is required") + return + } + + var target domain.NetworkScanTarget + if err := json.NewDecoder(r.Body).Decode(&target); err != nil { + Error(w, http.StatusBadRequest, fmt.Sprintf("invalid request body: %v", err)) + return + } + + updated, err := h.svc.UpdateTarget(r.Context(), id, &target) + if err != nil { + Error(w, http.StatusInternalServerError, fmt.Sprintf("failed to update network scan target: %v", err)) + return + } + + JSON(w, http.StatusOK, updated) +} + +// DeleteNetworkScanTarget handles DELETE /api/v1/network-scan-targets/{id} +func (h NetworkScanHandler) DeleteNetworkScanTarget(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodDelete { + Error(w, http.StatusMethodNotAllowed, "Method not allowed") + return + } + + id := r.PathValue("id") + if id == "" { + Error(w, http.StatusBadRequest, "network scan target ID is required") + return + } + + if err := h.svc.DeleteTarget(r.Context(), id); err != nil { + Error(w, http.StatusNotFound, fmt.Sprintf("failed to delete network scan target: %v", err)) + return + } + + JSON(w, http.StatusNoContent, nil) +} + +// TriggerNetworkScan handles POST /api/v1/network-scan-targets/{id}/scan +func (h NetworkScanHandler) TriggerNetworkScan(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + Error(w, http.StatusMethodNotAllowed, "Method not allowed") + return + } + + id := r.PathValue("id") + if id == "" { + Error(w, http.StatusBadRequest, "network scan target ID is required") + return + } + + scan, err := 
h.svc.TriggerScan(r.Context(), id) + if err != nil { + Error(w, http.StatusInternalServerError, fmt.Sprintf("failed to trigger scan: %v", err)) + return + } + + // scan may be nil if no certs found + if scan == nil { + JSON(w, http.StatusOK, map[string]string{ + "status": "completed", + "message": "Scan completed, no certificates found", + }) + return + } + + JSON(w, http.StatusAccepted, scan) +} diff --git a/internal/api/handler/network_scan_handler_test.go b/internal/api/handler/network_scan_handler_test.go new file mode 100644 index 0000000..6d93782 --- /dev/null +++ b/internal/api/handler/network_scan_handler_test.go @@ -0,0 +1,220 @@ +package handler + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/shankar0123/certctl/internal/domain" +) + +// mockNetworkScanService implements NetworkScanService for testing. +type mockNetworkScanService struct { + targets []*domain.NetworkScanTarget +} + +func (m *mockNetworkScanService) ListTargets(ctx context.Context) ([]*domain.NetworkScanTarget, error) { + return m.targets, nil +} + +func (m *mockNetworkScanService) GetTarget(ctx context.Context, id string) (*domain.NetworkScanTarget, error) { + for _, t := range m.targets { + if t.ID == id { + return t, nil + } + } + return nil, fmt.Errorf("not found: %s", id) +} + +func (m *mockNetworkScanService) CreateTarget(ctx context.Context, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) { + if target.Name == "" { + return nil, fmt.Errorf("name is required") + } + target.ID = "nst-test-123" + m.targets = append(m.targets, target) + return target, nil +} + +func (m *mockNetworkScanService) UpdateTarget(ctx context.Context, id string, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) { + for _, t := range m.targets { + if t.ID == id { + if target.Name != "" { + t.Name = target.Name + } + return t, nil + } + } + return nil, fmt.Errorf("not found: %s", id) +} + +func (m 
*mockNetworkScanService) DeleteTarget(ctx context.Context, id string) error { + for i, t := range m.targets { + if t.ID == id { + m.targets = append(m.targets[:i], m.targets[i+1:]...) + return nil + } + } + return fmt.Errorf("not found: %s", id) +} + +func (m *mockNetworkScanService) TriggerScan(ctx context.Context, targetID string) (*domain.DiscoveryScan, error) { + for _, t := range m.targets { + if t.ID == targetID { + return &domain.DiscoveryScan{ + ID: "dscan-test", + AgentID: "server-scanner", + CertificatesFound: 3, + }, nil + } + } + return nil, fmt.Errorf("not found: %s", targetID) +} + +func TestListNetworkScanTargets(t *testing.T) { + svc := &mockNetworkScanService{ + targets: []*domain.NetworkScanTarget{ + {ID: "nst-1", Name: "target1", CIDRs: []string{"10.0.0.0/24"}, Ports: []int{443}}, + {ID: "nst-2", Name: "target2", CIDRs: []string{"192.168.0.0/16"}, Ports: []int{443, 8443}}, + }, + } + h := NewNetworkScanHandler(svc) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/network-scan-targets", nil) + w := httptest.NewRecorder() + h.ListNetworkScanTargets(w, req) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + + var resp PagedResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Total != 2 { + t.Errorf("expected total 2, got %d", resp.Total) + } +} + +func TestListNetworkScanTargets_Empty(t *testing.T) { + svc := &mockNetworkScanService{} + h := NewNetworkScanHandler(svc) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/network-scan-targets", nil) + w := httptest.NewRecorder() + h.ListNetworkScanTargets(w, req) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } +} + +func TestCreateNetworkScanTarget(t *testing.T) { + svc := &mockNetworkScanService{} + h := NewNetworkScanHandler(svc) + + body, _ := json.Marshal(map[string]interface{}{ + "name": "Production", + "cidrs": []string{"10.0.0.0/24"}, + "ports": []int{443}, + }) + + req := httptest.NewRequest(http.MethodPost, 
"/api/v1/network-scan-targets", bytes.NewReader(body)) + w := httptest.NewRecorder() + h.CreateNetworkScanTarget(w, req) + + if w.Code != http.StatusCreated { + t.Errorf("expected 201, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestCreateNetworkScanTarget_InvalidJSON(t *testing.T) { + svc := &mockNetworkScanService{} + h := NewNetworkScanHandler(svc) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/network-scan-targets", bytes.NewReader([]byte("not json"))) + w := httptest.NewRecorder() + h.CreateNetworkScanTarget(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestCreateNetworkScanTarget_MissingName(t *testing.T) { + svc := &mockNetworkScanService{} + h := NewNetworkScanHandler(svc) + + body, _ := json.Marshal(map[string]interface{}{ + "cidrs": []string{"10.0.0.0/24"}, + }) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/network-scan-targets", bytes.NewReader(body)) + w := httptest.NewRecorder() + h.CreateNetworkScanTarget(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestDeleteNetworkScanTarget_NotFound(t *testing.T) { + svc := &mockNetworkScanService{} + h := NewNetworkScanHandler(svc) + + req := httptest.NewRequest(http.MethodDelete, "/api/v1/network-scan-targets/nst-nonexistent", nil) + req.SetPathValue("id", "nst-nonexistent") + w := httptest.NewRecorder() + h.DeleteNetworkScanTarget(w, req) + + if w.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d", w.Code) + } +} + +func TestTriggerNetworkScan(t *testing.T) { + svc := &mockNetworkScanService{ + targets: []*domain.NetworkScanTarget{ + {ID: "nst-1", Name: "target1"}, + }, + } + h := NewNetworkScanHandler(svc) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/network-scan-targets/nst-1/scan", nil) + req.SetPathValue("id", "nst-1") + w := httptest.NewRecorder() + h.TriggerNetworkScan(w, req) + + if w.Code != http.StatusAccepted { + 
t.Errorf("expected 202, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestTriggerNetworkScan_NotFound(t *testing.T) { + svc := &mockNetworkScanService{} + h := NewNetworkScanHandler(svc) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/network-scan-targets/nst-nonexistent/scan", nil) + req.SetPathValue("id", "nst-nonexistent") + w := httptest.NewRecorder() + h.TriggerNetworkScan(w, req) + + if w.Code != http.StatusInternalServerError { + t.Errorf("expected 500, got %d", w.Code) + } +} + +func TestListNetworkScanTargets_MethodNotAllowed(t *testing.T) { + svc := &mockNetworkScanService{} + h := NewNetworkScanHandler(svc) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/network-scan-targets", nil) + w := httptest.NewRecorder() + h.ListNetworkScanTargets(w, req) + + if w.Code != http.StatusMethodNotAllowed { + t.Errorf("expected 405, got %d", w.Code) + } +} diff --git a/internal/api/handler/stats_handler_test.go b/internal/api/handler/stats_handler_test.go index 40b07c4..45d0bef 100644 --- a/internal/api/handler/stats_handler_test.go +++ b/internal/api/handler/stats_handler_test.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" "net/http/httptest" + "strings" "testing" "time" ) @@ -202,3 +203,116 @@ func TestGetMetrics_ServiceError(t *testing.T) { t.Errorf("expected 500, got %d", w.Code) } } + +// --- Prometheus metrics endpoint tests --- + +func TestGetPrometheusMetrics_Success(t *testing.T) { + mock := &MockStatsService{ + GetDashboardSummaryFn: func(ctx context.Context) (interface{}, error) { + return &DashboardSummary{ + TotalCertificates: 25, + ExpiringCertificates: 3, + ExpiredCertificates: 2, + RevokedCertificates: 1, + ActiveAgents: 4, + TotalAgents: 6, + PendingJobs: 2, + FailedJobs: 1, + CompleteJobs: 15, + }, nil + }, + } + h := NewMetricsHandler(mock, time.Now().Add(-1*time.Hour)) + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/prometheus", nil) + w := httptest.NewRecorder() + h.GetPrometheusMetrics(w, req) + + if w.Code != 
http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + + contentType := w.Header().Get("Content-Type") + if contentType != "text/plain; version=0.0.4; charset=utf-8" { + t.Errorf("expected Prometheus content type, got %q", contentType) + } + + body := w.Body.String() + + // Check metric lines are present + expected := []string{ + "certctl_certificate_total 25", + "certctl_certificate_active 19", + "certctl_certificate_expiring_soon 3", + "certctl_certificate_expired 2", + "certctl_certificate_revoked 1", + "certctl_agent_total 6", + "certctl_agent_online 4", + "certctl_job_pending 2", + "certctl_job_completed_total 15", + "certctl_job_failed_total 1", + "# TYPE certctl_certificate_total gauge", + "# TYPE certctl_job_completed_total counter", + "# HELP certctl_uptime_seconds", + "# TYPE certctl_uptime_seconds gauge", + } + for _, exp := range expected { + if !containsLine(body, exp) { + t.Errorf("expected body to contain %q", exp) + } + } +} + +func TestGetPrometheusMetrics_MethodNotAllowed(t *testing.T) { + mock := &MockStatsService{} + h := NewMetricsHandler(mock, time.Now()) + req := httptest.NewRequest(http.MethodPost, "/api/v1/metrics/prometheus", nil) + w := httptest.NewRecorder() + h.GetPrometheusMetrics(w, req) + if w.Code != http.StatusMethodNotAllowed { + t.Errorf("expected 405, got %d", w.Code) + } +} + +func TestGetPrometheusMetrics_ServiceError(t *testing.T) { + mock := &MockStatsService{ + GetDashboardSummaryFn: func(ctx context.Context) (interface{}, error) { + return nil, fmt.Errorf("db error") + }, + } + h := NewMetricsHandler(mock, time.Now()) + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/prometheus", nil) + w := httptest.NewRecorder() + h.GetPrometheusMetrics(w, req) + if w.Code != http.StatusInternalServerError { + t.Errorf("expected 500, got %d", w.Code) + } +} + +func TestGetPrometheusMetrics_ZeroValues(t *testing.T) { + mock := &MockStatsService{ + GetDashboardSummaryFn: func(ctx context.Context) (interface{}, error) 
{ + return &DashboardSummary{}, nil + }, + } + h := NewMetricsHandler(mock, time.Now()) + req := httptest.NewRequest(http.MethodGet, "/api/v1/metrics/prometheus", nil) + w := httptest.NewRecorder() + h.GetPrometheusMetrics(w, req) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + + body := w.Body.String() + if !containsLine(body, "certctl_certificate_total 0") { + t.Error("expected zero value for certificate_total") + } + if !containsLine(body, "certctl_job_pending 0") { + t.Error("expected zero value for job_pending") + } +} + +// containsLine checks if the text contains the given substring. +func containsLine(text, substr string) bool { + return strings.Contains(text, substr) +} diff --git a/internal/api/router/router.go b/internal/api/router/router.go index 4b2d6c6..0ce87fd 100644 --- a/internal/api/router/router.go +++ b/internal/api/router/router.go @@ -61,6 +61,7 @@ func (r *Router) RegisterHandlers( metrics handler.MetricsHandler, health handler.HealthHandler, discovery handler.DiscoveryHandler, + networkScan handler.NetworkScanHandler, ) { // Health endpoints (no auth middleware — must always be accessible) r.mux.Handle("GET /health", middleware.Chain( @@ -188,6 +189,7 @@ func (r *Router) RegisterHandlers( // Metrics routes: /api/v1/metrics r.Register("GET /api/v1/metrics", http.HandlerFunc(metrics.GetMetrics)) + r.Register("GET /api/v1/metrics/prometheus", http.HandlerFunc(metrics.GetPrometheusMetrics)) // Discovery routes: /api/v1/discovered-certificates, /api/v1/discovery-scans r.Register("POST /api/v1/agents/{id}/discoveries", http.HandlerFunc(discovery.SubmitDiscoveryReport)) @@ -197,6 +199,14 @@ func (r *Router) RegisterHandlers( r.Register("POST /api/v1/discovered-certificates/{id}/dismiss", http.HandlerFunc(discovery.DismissDiscovered)) r.Register("GET /api/v1/discovery-scans", http.HandlerFunc(discovery.ListScans)) r.Register("GET /api/v1/discovery-summary", http.HandlerFunc(discovery.GetDiscoverySummary)) + + // 
Network scan routes: /api/v1/network-scan-targets + r.Register("GET /api/v1/network-scan-targets", http.HandlerFunc(networkScan.ListNetworkScanTargets)) + r.Register("POST /api/v1/network-scan-targets", http.HandlerFunc(networkScan.CreateNetworkScanTarget)) + r.Register("GET /api/v1/network-scan-targets/{id}", http.HandlerFunc(networkScan.GetNetworkScanTarget)) + r.Register("PUT /api/v1/network-scan-targets/{id}", http.HandlerFunc(networkScan.UpdateNetworkScanTarget)) + r.Register("DELETE /api/v1/network-scan-targets/{id}", http.HandlerFunc(networkScan.DeleteNetworkScanTarget)) + r.Register("POST /api/v1/network-scan-targets/{id}/scan", http.HandlerFunc(networkScan.TriggerNetworkScan)) } // GetMux returns the underlying http.ServeMux for direct access if needed. diff --git a/internal/config/config.go b/internal/config/config.go index d0a4c2d..ba1e74b 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,16 +11,17 @@ import ( // Config represents the complete application configuration. // All configuration values are read from environment variables with CERTCTL_ prefix. type Config struct { - Server ServerConfig - Database DatabaseConfig - Scheduler SchedulerConfig - Log LogConfig - Auth AuthConfig - RateLimit RateLimitConfig - CORS CORSConfig - Keygen KeygenConfig - CA CAConfig - Notifiers NotifierConfig + Server ServerConfig + Database DatabaseConfig + Scheduler SchedulerConfig + Log LogConfig + Auth AuthConfig + RateLimit RateLimitConfig + CORS CORSConfig + Keygen KeygenConfig + CA CAConfig + Notifiers NotifierConfig + NetworkScan NetworkScanConfig } // NotifierConfig contains configuration for notification connectors. @@ -80,6 +81,12 @@ type OpenSSLConfig struct { TimeoutSeconds int } +// NetworkScanConfig controls the server-side active TLS scanner. 
+type NetworkScanConfig struct { + Enabled bool // Enable network scanning (default false) + ScanInterval time.Duration // How often to run network scans (default 6h) +} + // ServerConfig contains HTTP server configuration. type ServerConfig struct { Host string @@ -178,6 +185,10 @@ func Load() (*Config, error) { OpsGenieAPIKey: getEnv("CERTCTL_OPSGENIE_API_KEY", ""), OpsGeniePriority: getEnv("CERTCTL_OPSGENIE_PRIORITY", "P3"), }, + NetworkScan: NetworkScanConfig{ + Enabled: getEnvBool("CERTCTL_NETWORK_SCAN_ENABLED", false), + ScanInterval: getEnvDuration("CERTCTL_NETWORK_SCAN_INTERVAL", 6*time.Hour), + }, } if err := cfg.Validate(); err != nil { diff --git a/internal/domain/network_scan.go b/internal/domain/network_scan.go new file mode 100644 index 0000000..9ffcd99 --- /dev/null +++ b/internal/domain/network_scan.go @@ -0,0 +1,27 @@ +package domain + +import "time" + +// NetworkScanTarget defines a network range to scan for TLS certificates. +type NetworkScanTarget struct { + ID string `json:"id"` + Name string `json:"name"` + CIDRs []string `json:"cidrs"` + Ports []int `json:"ports"` + Enabled bool `json:"enabled"` + ScanIntervalHours int `json:"scan_interval_hours"` + TimeoutMs int `json:"timeout_ms"` + LastScanAt *time.Time `json:"last_scan_at,omitempty"` + LastScanDurationMs *int `json:"last_scan_duration_ms,omitempty"` + LastScanCertsFound *int `json:"last_scan_certs_found,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +// NetworkScanResult holds the outcome of scanning a single endpoint. 
+type NetworkScanResult struct { + Address string // "ip:port" + Certs []DiscoveredCertEntry + Error string + LatencyMs int +} diff --git a/internal/domain/network_scan_test.go b/internal/domain/network_scan_test.go new file mode 100644 index 0000000..babe285 --- /dev/null +++ b/internal/domain/network_scan_test.go @@ -0,0 +1,67 @@ +package domain + +import ( + "testing" + "time" +) + +func TestNetworkScanTarget_Defaults(t *testing.T) { + target := NetworkScanTarget{ + ID: "nst-test", + Name: "Test Target", + CIDRs: []string{"10.0.0.0/24"}, + Ports: []int{443}, + Enabled: true, + ScanIntervalHours: 6, + TimeoutMs: 5000, + } + + if target.ID != "nst-test" { + t.Errorf("expected ID nst-test, got %s", target.ID) + } + if len(target.CIDRs) != 1 || target.CIDRs[0] != "10.0.0.0/24" { + t.Errorf("unexpected CIDRs: %v", target.CIDRs) + } + if target.LastScanAt != nil { + t.Error("expected nil LastScanAt for new target") + } +} + +func TestNetworkScanTarget_WithScanResults(t *testing.T) { + now := time.Now() + duration := 1500 + found := 12 + target := NetworkScanTarget{ + ID: "nst-prod", + Name: "Production Network", + CIDRs: []string{"192.168.1.0/24", "10.0.0.0/16"}, + Ports: []int{443, 8443, 636}, + Enabled: true, + ScanIntervalHours: 1, + TimeoutMs: 3000, + LastScanAt: &now, + LastScanDurationMs: &duration, + LastScanCertsFound: &found, + } + + if len(target.Ports) != 3 { + t.Errorf("expected 3 ports, got %d", len(target.Ports)) + } + if *target.LastScanCertsFound != 12 { + t.Errorf("expected 12 certs found, got %d", *target.LastScanCertsFound) + } +} + +func TestNetworkScanResult_Fields(t *testing.T) { + result := NetworkScanResult{ + Address: "192.168.1.1:443", + Error: "", + LatencyMs: 45, + } + if result.Address != "192.168.1.1:443" { + t.Errorf("expected address 192.168.1.1:443, got %s", result.Address) + } + if result.LatencyMs != 45 { + t.Errorf("expected latency 45ms, got %d", result.LatencyMs) + } +} diff --git a/internal/integration/lifecycle_test.go 
b/internal/integration/lifecycle_test.go index 49e9447..9dbb1dd 100644 --- a/internal/integration/lifecycle_test.go +++ b/internal/integration/lifecycle_test.go @@ -80,6 +80,7 @@ func TestCertificateLifecycle(t *testing.T) { metricsHandler := handler.NewMetricsHandler(&mockStatsService{}, time.Now()) healthHandler := handler.NewHealthHandler("none") discoveryHandler := handler.NewDiscoveryHandler(&mockDiscoveryService{}) + networkScanHandler := handler.NewNetworkScanHandler(&mockNetworkScanService{}) // Create router and register handlers r := router.New() @@ -100,6 +101,7 @@ func TestCertificateLifecycle(t *testing.T) { metricsHandler, healthHandler, discoveryHandler, + networkScanHandler, ) // Create test server @@ -1174,3 +1176,30 @@ func (m *mockDiscoveryService) GetScan(ctx context.Context, id string) (*domain. func (m *mockDiscoveryService) GetDiscoverySummary(ctx context.Context) (map[string]int, error) { return map[string]int{}, nil } + +// mockNetworkScanService implements handler.NetworkScanService for integration tests. 
+type mockNetworkScanService struct{} + +func (m *mockNetworkScanService) ListTargets(ctx context.Context) ([]*domain.NetworkScanTarget, error) { + return nil, nil +} + +func (m *mockNetworkScanService) GetTarget(ctx context.Context, id string) (*domain.NetworkScanTarget, error) { + return nil, fmt.Errorf("not found") +} + +func (m *mockNetworkScanService) CreateTarget(ctx context.Context, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) { + return target, nil +} + +func (m *mockNetworkScanService) UpdateTarget(ctx context.Context, id string, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) { + return target, nil +} + +func (m *mockNetworkScanService) DeleteTarget(ctx context.Context, id string) error { + return nil +} + +func (m *mockNetworkScanService) TriggerScan(ctx context.Context, targetID string) (*domain.DiscoveryScan, error) { + return nil, nil +} diff --git a/internal/integration/negative_test.go b/internal/integration/negative_test.go index 3d9b501..35cd9a3 100644 --- a/internal/integration/negative_test.go +++ b/internal/integration/negative_test.go @@ -73,6 +73,7 @@ func setupTestServer(t *testing.T) (*httptest.Server, *mockCertificateRepository metricsHandler := handler.NewMetricsHandler(&mockStatsService{}, time.Now()) healthHandler := handler.NewHealthHandler("none") discoveryHandler := handler.NewDiscoveryHandler(&mockDiscoveryService{}) + networkScanHandler := handler.NewNetworkScanHandler(&mockNetworkScanService{}) r := router.New() r.RegisterHandlers( @@ -92,6 +93,7 @@ func setupTestServer(t *testing.T) (*httptest.Server, *mockCertificateRepository metricsHandler, healthHandler, discoveryHandler, + networkScanHandler, ) server := httptest.NewServer(r) @@ -796,3 +798,5 @@ func TestRevocationEndpoints(t *testing.T) { } }) } + +// mockNetworkScanService is defined in lifecycle_test.go (same package) diff --git a/internal/repository/interfaces.go b/internal/repository/interfaces.go index 29cd00a..44c9151 
100644 --- a/internal/repository/interfaces.go +++ b/internal/repository/interfaces.go @@ -238,6 +238,24 @@ type DiscoveryFilter struct { PerPage int } +// NetworkScanRepository defines operations for managing network scan targets. +type NetworkScanRepository interface { + // List returns all network scan targets. + List(ctx context.Context) ([]*domain.NetworkScanTarget, error) + // ListEnabled returns only enabled scan targets. + ListEnabled(ctx context.Context) ([]*domain.NetworkScanTarget, error) + // Get retrieves a network scan target by ID. + Get(ctx context.Context, id string) (*domain.NetworkScanTarget, error) + // Create stores a new network scan target. + Create(ctx context.Context, target *domain.NetworkScanTarget) error + // Update modifies an existing network scan target. + Update(ctx context.Context, target *domain.NetworkScanTarget) error + // Delete removes a network scan target. + Delete(ctx context.Context, id string) error + // UpdateScanResults records the outcome of the last scan for a target. + UpdateScanResults(ctx context.Context, id string, scanAt time.Time, durationMs int, certsFound int) error +} + // OwnerRepository defines operations for managing certificate owners. type OwnerRepository interface { // List returns all owners. diff --git a/internal/repository/postgres/network_scan.go b/internal/repository/postgres/network_scan.go new file mode 100644 index 0000000..93781c1 --- /dev/null +++ b/internal/repository/postgres/network_scan.go @@ -0,0 +1,181 @@ +package postgres + +import ( + "context" + "database/sql" + "fmt" + "time" + + "github.com/lib/pq" + "github.com/shankar0123/certctl/internal/domain" +) + +// NetworkScanRepository implements repository.NetworkScanRepository using PostgreSQL. +type NetworkScanRepository struct { + db *sql.DB +} + +// NewNetworkScanRepository creates a new PostgreSQL-backed network scan repository. 
+func NewNetworkScanRepository(db *sql.DB) *NetworkScanRepository { + return &NetworkScanRepository{db: db} +} + +// List returns all network scan targets. +func (r *NetworkScanRepository) List(ctx context.Context) ([]*domain.NetworkScanTarget, error) { + rows, err := r.db.QueryContext(ctx, ` + SELECT id, name, cidrs, ports, enabled, scan_interval_hours, timeout_ms, + last_scan_at, last_scan_duration_ms, last_scan_certs_found, + created_at, updated_at + FROM network_scan_targets + ORDER BY created_at DESC`) + if err != nil { + return nil, fmt.Errorf("list network scan targets: %w", err) + } + defer rows.Close() + return r.scanRows(rows) +} + +// ListEnabled returns only enabled scan targets. +func (r *NetworkScanRepository) ListEnabled(ctx context.Context) ([]*domain.NetworkScanTarget, error) { + rows, err := r.db.QueryContext(ctx, ` + SELECT id, name, cidrs, ports, enabled, scan_interval_hours, timeout_ms, + last_scan_at, last_scan_duration_ms, last_scan_certs_found, + created_at, updated_at + FROM network_scan_targets + WHERE enabled = TRUE + ORDER BY created_at DESC`) + if err != nil { + return nil, fmt.Errorf("list enabled network scan targets: %w", err) + } + defer rows.Close() + return r.scanRows(rows) +} + +// Get retrieves a network scan target by ID. 
+func (r *NetworkScanRepository) Get(ctx context.Context, id string) (*domain.NetworkScanTarget, error) { + target := &domain.NetworkScanTarget{} + var lastScanAt sql.NullTime + var lastScanDurationMs, lastScanCertsFound sql.NullInt64 + err := r.db.QueryRowContext(ctx, ` + SELECT id, name, cidrs, ports, enabled, scan_interval_hours, timeout_ms, + last_scan_at, last_scan_duration_ms, last_scan_certs_found, + created_at, updated_at + FROM network_scan_targets + WHERE id = $1`, id).Scan( + &target.ID, &target.Name, pq.Array(&target.CIDRs), pq.Array(&target.Ports), + &target.Enabled, &target.ScanIntervalHours, &target.TimeoutMs, + &lastScanAt, &lastScanDurationMs, &lastScanCertsFound, + &target.CreatedAt, &target.UpdatedAt, + ) + if err == sql.ErrNoRows { + return nil, fmt.Errorf("network scan target not found: %s", id) + } + if err != nil { + return nil, fmt.Errorf("get network scan target: %w", err) + } + if lastScanAt.Valid { + target.LastScanAt = &lastScanAt.Time + } + if lastScanDurationMs.Valid { + v := int(lastScanDurationMs.Int64) + target.LastScanDurationMs = &v + } + if lastScanCertsFound.Valid { + v := int(lastScanCertsFound.Int64) + target.LastScanCertsFound = &v + } + return target, nil +} + +// Create stores a new network scan target. +func (r *NetworkScanRepository) Create(ctx context.Context, target *domain.NetworkScanTarget) error { + _, err := r.db.ExecContext(ctx, ` + INSERT INTO network_scan_targets (id, name, cidrs, ports, enabled, scan_interval_hours, timeout_ms, created_at, updated_at) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, + target.ID, target.Name, pq.Array(target.CIDRs), pq.Array(target.Ports), + target.Enabled, target.ScanIntervalHours, target.TimeoutMs, + target.CreatedAt, target.UpdatedAt, + ) + if err != nil { + return fmt.Errorf("create network scan target: %w", err) + } + return nil +} + +// Update modifies an existing network scan target. 
+func (r *NetworkScanRepository) Update(ctx context.Context, target *domain.NetworkScanTarget) error { + result, err := r.db.ExecContext(ctx, ` + UPDATE network_scan_targets + SET name = $1, cidrs = $2, ports = $3, enabled = $4, scan_interval_hours = $5, timeout_ms = $6, updated_at = $7 + WHERE id = $8`, + target.Name, pq.Array(target.CIDRs), pq.Array(target.Ports), + target.Enabled, target.ScanIntervalHours, target.TimeoutMs, + time.Now(), target.ID, + ) + if err != nil { + return fmt.Errorf("update network scan target: %w", err) + } + rows, _ := result.RowsAffected() + if rows == 0 { + return fmt.Errorf("network scan target not found: %s", target.ID) + } + return nil +} + +// Delete removes a network scan target. +func (r *NetworkScanRepository) Delete(ctx context.Context, id string) error { + result, err := r.db.ExecContext(ctx, `DELETE FROM network_scan_targets WHERE id = $1`, id) + if err != nil { + return fmt.Errorf("delete network scan target: %w", err) + } + rows, _ := result.RowsAffected() + if rows == 0 { + return fmt.Errorf("network scan target not found: %s", id) + } + return nil +} + +// UpdateScanResults records the outcome of the last scan for a target. +func (r *NetworkScanRepository) UpdateScanResults(ctx context.Context, id string, scanAt time.Time, durationMs int, certsFound int) error { + _, err := r.db.ExecContext(ctx, ` + UPDATE network_scan_targets + SET last_scan_at = $1, last_scan_duration_ms = $2, last_scan_certs_found = $3, updated_at = $4 + WHERE id = $5`, + scanAt, durationMs, certsFound, time.Now(), id, + ) + if err != nil { + return fmt.Errorf("update scan results: %w", err) + } + return nil +} + +// scanRows scans multiple rows from a query result. 
+func (r *NetworkScanRepository) scanRows(rows *sql.Rows) ([]*domain.NetworkScanTarget, error) { + var targets []*domain.NetworkScanTarget + for rows.Next() { + target := &domain.NetworkScanTarget{} + var lastScanAt sql.NullTime + var lastScanDurationMs, lastScanCertsFound sql.NullInt64 + if err := rows.Scan( + &target.ID, &target.Name, pq.Array(&target.CIDRs), pq.Array(&target.Ports), + &target.Enabled, &target.ScanIntervalHours, &target.TimeoutMs, + &lastScanAt, &lastScanDurationMs, &lastScanCertsFound, + &target.CreatedAt, &target.UpdatedAt, + ); err != nil { + return nil, fmt.Errorf("scan network scan target row: %w", err) + } + if lastScanAt.Valid { + target.LastScanAt = &lastScanAt.Time + } + if lastScanDurationMs.Valid { + v := int(lastScanDurationMs.Int64) + target.LastScanDurationMs = &v + } + if lastScanCertsFound.Valid { + v := int(lastScanCertsFound.Int64) + target.LastScanCertsFound = &v + } + targets = append(targets, target) + } + return targets, rows.Err() +} diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 0ed898c..ea31046 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -16,6 +16,7 @@ type Scheduler struct { jobService *service.JobService agentService *service.AgentService notificationService *service.NotificationService + networkScanService *service.NetworkScanService logger *slog.Logger // Configurable tick intervals @@ -24,6 +25,7 @@ type Scheduler struct { agentHealthCheckInterval time.Duration notificationProcessInterval time.Duration shortLivedExpiryCheckInterval time.Duration + networkScanInterval time.Duration } // NewScheduler creates a new scheduler with configurable intervals. 
@@ -32,6 +34,7 @@ func NewScheduler( jobService *service.JobService, agentService *service.AgentService, notificationService *service.NotificationService, + networkScanService *service.NetworkScanService, logger *slog.Logger, ) *Scheduler { return &Scheduler{ @@ -39,6 +42,7 @@ func NewScheduler( jobService: jobService, agentService: agentService, notificationService: notificationService, + networkScanService: networkScanService, logger: logger, // Default intervals @@ -47,6 +51,7 @@ func NewScheduler( agentHealthCheckInterval: 2 * time.Minute, notificationProcessInterval: 1 * time.Minute, shortLivedExpiryCheckInterval: 30 * time.Second, + networkScanInterval: 6 * time.Hour, } } @@ -70,6 +75,11 @@ func (s *Scheduler) SetNotificationProcessInterval(d time.Duration) { s.notificationProcessInterval = d } +// SetNetworkScanInterval configures the interval for network scanning. +func (s *Scheduler) SetNetworkScanInterval(d time.Duration) { + s.networkScanInterval = d +} + // Start initiates all background scheduler loops. It returns a channel that signals // when the scheduler has started all loops. The scheduler runs until the context is cancelled. func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { @@ -90,6 +100,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} { go s.agentHealthCheckLoop(ctx) go s.notificationProcessLoop(ctx) go s.shortLivedExpiryCheckLoop(ctx) + if s.networkScanService != nil { + go s.networkScanLoop(ctx) + } // Wait for context cancellation <-ctx.Done() @@ -258,3 +271,35 @@ func (s *Scheduler) runShortLivedExpiryCheck(ctx context.Context) { s.logger.Debug("short-lived expiry check completed") } } + +// networkScanLoop runs every networkScanInterval and performs active TLS scanning +// of configured network targets. 
+func (s *Scheduler) networkScanLoop(ctx context.Context) { + ticker := time.NewTicker(s.networkScanInterval) + defer ticker.Stop() + + // Run immediately on start + s.runNetworkScan(ctx) + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + s.runNetworkScan(ctx) + } + } +} + +// runNetworkScan executes a single network scan cycle with error recovery. +func (s *Scheduler) runNetworkScan(ctx context.Context) { + opCtx, cancel := context.WithTimeout(ctx, 30*time.Minute) + defer cancel() + if err := s.networkScanService.ScanAllTargets(opCtx); err != nil { + s.logger.Error("network scan failed", + "error", err, + "interval", s.networkScanInterval.String()) + } else { + s.logger.Debug("network scan completed") + } +} diff --git a/internal/service/network_scan.go b/internal/service/network_scan.go new file mode 100644 index 0000000..566fc20 --- /dev/null +++ b/internal/service/network_scan.go @@ -0,0 +1,436 @@ +package service + +import ( + "context" + "crypto/ecdsa" + "crypto/rsa" + "crypto/sha256" + "crypto/tls" + "crypto/x509" + "encoding/pem" + "fmt" + "log/slog" + "net" + "sync" + "time" + + "github.com/shankar0123/certctl/internal/domain" + "github.com/shankar0123/certctl/internal/repository" +) + +// SentinelAgentID is the agent ID used for network-discovered certificates. +// This allows the existing discovery dedup constraint (fingerprint, agent_id, source_path) +// to work without schema changes. +const SentinelAgentID = "server-scanner" + +// NetworkScanService manages active TLS scanning of network endpoints. +type NetworkScanService struct { + networkScanRepo repository.NetworkScanRepository + discoveryService *DiscoveryService + auditService *AuditService + logger *slog.Logger + concurrency int +} + +// NewNetworkScanService creates a new network scan service. 
+func NewNetworkScanService( + networkScanRepo repository.NetworkScanRepository, + discoveryService *DiscoveryService, + auditService *AuditService, + logger *slog.Logger, +) *NetworkScanService { + return &NetworkScanService{ + networkScanRepo: networkScanRepo, + discoveryService: discoveryService, + auditService: auditService, + logger: logger, + concurrency: 50, + } +} + +// ListTargets returns all network scan targets. +func (s *NetworkScanService) ListTargets(ctx context.Context) ([]*domain.NetworkScanTarget, error) { + return s.networkScanRepo.List(ctx) +} + +// GetTarget retrieves a network scan target by ID. +func (s *NetworkScanService) GetTarget(ctx context.Context, id string) (*domain.NetworkScanTarget, error) { + return s.networkScanRepo.Get(ctx, id) +} + +// CreateTarget creates a new network scan target. +func (s *NetworkScanService) CreateTarget(ctx context.Context, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) { + if target.Name == "" { + return nil, fmt.Errorf("name is required") + } + if len(target.CIDRs) == 0 { + return nil, fmt.Errorf("at least one CIDR is required") + } + // Validate CIDRs + for _, cidr := range target.CIDRs { + if _, _, err := net.ParseCIDR(cidr); err != nil { + // Try parsing as plain IP + if ip := net.ParseIP(cidr); ip == nil { + return nil, fmt.Errorf("invalid CIDR or IP: %s", cidr) + } + } + } + if len(target.Ports) == 0 { + target.Ports = []int{443} + } + if target.ScanIntervalHours == 0 { + target.ScanIntervalHours = 6 + } + if target.TimeoutMs == 0 { + target.TimeoutMs = 5000 + } + target.ID = generateID("nst") + target.Enabled = true + target.CreatedAt = time.Now() + target.UpdatedAt = time.Now() + + if err := s.networkScanRepo.Create(ctx, target); err != nil { + return nil, err + } + + s.auditService.RecordEvent(ctx, "operator", domain.ActorTypeUser, + "network_scan_target_created", "network_scan_target", target.ID, + map[string]interface{}{ + "name": target.Name, + "cidrs": target.CIDRs, + 
"ports": target.Ports, + }) + + return target, nil +} + +// UpdateTarget updates an existing network scan target. +func (s *NetworkScanService) UpdateTarget(ctx context.Context, id string, target *domain.NetworkScanTarget) (*domain.NetworkScanTarget, error) { + existing, err := s.networkScanRepo.Get(ctx, id) + if err != nil { + return nil, err + } + + if target.Name != "" { + existing.Name = target.Name + } + if len(target.CIDRs) > 0 { + // Validate new CIDRs + for _, cidr := range target.CIDRs { + if _, _, err := net.ParseCIDR(cidr); err != nil { + if ip := net.ParseIP(cidr); ip == nil { + return nil, fmt.Errorf("invalid CIDR or IP: %s", cidr) + } + } + } + existing.CIDRs = target.CIDRs + } + if len(target.Ports) > 0 { + existing.Ports = target.Ports + } + if target.ScanIntervalHours > 0 { + existing.ScanIntervalHours = target.ScanIntervalHours + } + if target.TimeoutMs > 0 { + existing.TimeoutMs = target.TimeoutMs + } + // Always update enabled field (it's a boolean, so 0-value is meaningful) + existing.Enabled = target.Enabled + + if err := s.networkScanRepo.Update(ctx, existing); err != nil { + return nil, err + } + + return existing, nil +} + +// DeleteTarget removes a network scan target. +func (s *NetworkScanService) DeleteTarget(ctx context.Context, id string) error { + if err := s.networkScanRepo.Delete(ctx, id); err != nil { + return err + } + + s.auditService.RecordEvent(ctx, "operator", domain.ActorTypeUser, + "network_scan_target_deleted", "network_scan_target", id, nil) + + return nil +} + +// ScanAllTargets runs the active TLS scan for all enabled targets. +// This is called by the scheduler on the configured interval. 
+func (s *NetworkScanService) ScanAllTargets(ctx context.Context) error { + targets, err := s.networkScanRepo.ListEnabled(ctx) + if err != nil { + return fmt.Errorf("list enabled targets: %w", err) + } + + if len(targets) == 0 { + if s.logger != nil { + s.logger.Debug("no enabled network scan targets") + } + return nil + } + + if s.logger != nil { + s.logger.Info("starting network scan", "targets", len(targets)) + } + + for _, target := range targets { + if ctx.Err() != nil { + return ctx.Err() + } + s.scanTarget(ctx, target) + } + + return nil +} + +// TriggerScan runs an immediate scan for a specific target. +func (s *NetworkScanService) TriggerScan(ctx context.Context, targetID string) (*domain.DiscoveryScan, error) { + target, err := s.networkScanRepo.Get(ctx, targetID) + if err != nil { + return nil, err + } + return s.scanTarget(ctx, target), nil +} + +// scanTarget scans a single network target and feeds results into the discovery pipeline. +func (s *NetworkScanService) scanTarget(ctx context.Context, target *domain.NetworkScanTarget) *domain.DiscoveryScan { + startTime := time.Now() + if s.logger != nil { + s.logger.Info("scanning network target", + "target_id", target.ID, + "name", target.Name, + "cidrs", target.CIDRs, + "ports", target.Ports) + } + + // Expand CIDRs to individual IPs + endpoints := s.expandEndpoints(target.CIDRs, target.Ports) + if s.logger != nil { + s.logger.Debug("expanded endpoints", "count", len(endpoints)) + } + + // Scan endpoints concurrently + timeout := time.Duration(target.TimeoutMs) * time.Millisecond + results := s.scanEndpoints(ctx, endpoints, timeout) + + // Collect discovered cert entries + var entries []domain.DiscoveredCertEntry + var scanErrors []string + for _, result := range results { + if result.Error != "" { + // Only log connection errors at debug level (many hosts won't have TLS) + if s.logger != nil { + s.logger.Debug("scan endpoint error", + "address", result.Address, + "error", result.Error) + } + continue + } 
+ entries = append(entries, result.Certs...) + } + + scanDuration := time.Since(startTime) + if s.logger != nil { + s.logger.Info("network target scan completed", + "target_id", target.ID, + "endpoints_scanned", len(endpoints), + "certificates_found", len(entries), + "errors", len(scanErrors), + "duration_ms", scanDuration.Milliseconds()) + } + + // Update scan results on target + s.networkScanRepo.UpdateScanResults(ctx, target.ID, time.Now(), + int(scanDuration.Milliseconds()), len(entries)) + + // Feed into discovery pipeline if we found certs + if len(entries) == 0 { + return nil + } + + // Build directories list from CIDRs for the scan record + dirs := make([]string, len(target.CIDRs)) + copy(dirs, target.CIDRs) + + report := &domain.DiscoveryReport{ + AgentID: SentinelAgentID, + Directories: dirs, + Certificates: entries, + Errors: scanErrors, + ScanDurationMs: int(scanDuration.Milliseconds()), + } + + scan, err := s.discoveryService.ProcessDiscoveryReport(ctx, report) + if err != nil { + if s.logger != nil { + s.logger.Error("failed to process network scan report", + "target_id", target.ID, + "error", err) + } + return nil + } + + return scan +} + +// expandEndpoints converts CIDR ranges and ports into a list of "ip:port" endpoints. +func (s *NetworkScanService) expandEndpoints(cidrs []string, ports []int) []string { + var endpoints []string + + for _, cidr := range cidrs { + ips := expandCIDR(cidr) + for _, ip := range ips { + for _, port := range ports { + endpoints = append(endpoints, fmt.Sprintf("%s:%d", ip, port)) + } + } + } + + return endpoints +} + +// expandCIDR expands a CIDR notation or single IP into a list of IPs. +// Limits expansion to /20 (4096 IPs) to prevent accidental huge scans. 
+func expandCIDR(cidr string) []string { + // Try as CIDR first + ip, ipNet, err := net.ParseCIDR(cidr) + if err != nil { + // Try as single IP + if singleIP := net.ParseIP(cidr); singleIP != nil { + return []string{singleIP.String()} + } + return nil + } + + // Count network size and cap at /20 + ones, bits := ipNet.Mask.Size() + hostBits := bits - ones + if hostBits > 12 { // More than 4096 hosts + return nil // Skip overly large networks + } + + var ips []string + for ip := ip.Mask(ipNet.Mask); ipNet.Contains(ip); incrementIP(ip) { + // Copy IP before appending (net.IP is a mutable slice) + ipCopy := make(net.IP, len(ip)) + copy(ipCopy, ip) + ips = append(ips, ipCopy.String()) + } + + // Remove network and broadcast for IPv4 /31 and larger + if len(ips) > 2 { + ips = ips[1 : len(ips)-1] + } + + return ips +} + +// incrementIP increments an IP address by one. +func incrementIP(ip net.IP) { + for j := len(ip) - 1; j >= 0; j-- { + ip[j]++ + if ip[j] > 0 { + break + } + } +} + +// scanEndpoints probes TLS endpoints concurrently and returns results. +func (s *NetworkScanService) scanEndpoints(ctx context.Context, endpoints []string, timeout time.Duration) []domain.NetworkScanResult { + results := make([]domain.NetworkScanResult, len(endpoints)) + sem := make(chan struct{}, s.concurrency) + var wg sync.WaitGroup + + for i, endpoint := range endpoints { + if ctx.Err() != nil { + break + } + wg.Add(1) + sem <- struct{}{} + go func(idx int, addr string) { + defer wg.Done() + defer func() { <-sem }() + results[idx] = s.probeTLS(ctx, addr, timeout) + }(i, endpoint) + } + wg.Wait() + return results +} + +// probeTLS connects to an endpoint, performs a TLS handshake, and extracts certificates. 
+func (s *NetworkScanService) probeTLS(ctx context.Context, address string, timeout time.Duration) domain.NetworkScanResult { + startTime := time.Now() + result := domain.NetworkScanResult{Address: address} + + dialer := &net.Dialer{Timeout: timeout} + conn, err := tls.DialWithDialer(dialer, "tcp", address, &tls.Config{ + InsecureSkipVerify: true, // We want to discover ALL certs, including self-signed + }) + if err != nil { + result.Error = err.Error() + result.LatencyMs = int(time.Since(startTime).Milliseconds()) + return result + } + defer conn.Close() + + result.LatencyMs = int(time.Since(startTime).Milliseconds()) + + // Extract certificates from TLS connection state + state := conn.ConnectionState() + for _, cert := range state.PeerCertificates { + entry := tlsCertToEntry(cert, address) + result.Certs = append(result.Certs, entry) + } + + return result +} + +// tlsCertToEntry converts an x509.Certificate from a TLS handshake into a DiscoveredCertEntry. +func tlsCertToEntry(cert *x509.Certificate, address string) domain.DiscoveredCertEntry { + // Compute SHA-256 fingerprint + fingerprintBytes := sha256.Sum256(cert.Raw) + fingerprint := fmt.Sprintf("%x", fingerprintBytes) + + // Encode as PEM + pemBlock := &pem.Block{Type: "CERTIFICATE", Bytes: cert.Raw} + pemData := string(pem.EncodeToMemory(pemBlock)) + + // Key algorithm and size + keyAlg, keySize := tlsCertKeyInfo(cert) + + return domain.DiscoveredCertEntry{ + FingerprintSHA256: fingerprint, + CommonName: cert.Subject.CommonName, + SANs: cert.DNSNames, + SerialNumber: cert.SerialNumber.Text(16), + IssuerDN: cert.Issuer.String(), + SubjectDN: cert.Subject.String(), + NotBefore: cert.NotBefore.UTC().Format(time.RFC3339), + NotAfter: cert.NotAfter.UTC().Format(time.RFC3339), + KeyAlgorithm: keyAlg, + KeySize: keySize, + IsCA: cert.IsCA, + PEMData: pemData, + SourcePath: address, + SourceFormat: "network", + } +} + +// tlsCertKeyInfo extracts key algorithm name and size from a certificate. 
+func tlsCertKeyInfo(cert *x509.Certificate) (string, int) {
+	switch pub := cert.PublicKey.(type) {
+	case *rsa.PublicKey:
+		return "RSA", pub.N.BitLen() // size is the modulus length in bits
+	case *ecdsa.PublicKey:
+		return "ECDSA", pub.Curve.Params().BitSize // size is the curve's field size in bits
+	default: // any other key type is classified by the certificate's declared algorithm
+		switch cert.PublicKeyAlgorithm {
+		case x509.Ed25519:
+			return "Ed25519", 256
+		default:
+			return cert.PublicKeyAlgorithm.String(), 0 // size 0 means "unknown"
+		}
+	}
+}
diff --git a/internal/service/network_scan_test.go b/internal/service/network_scan_test.go
new file mode 100644
index 0000000..26a62ad
--- /dev/null
+++ b/internal/service/network_scan_test.go
@@ -0,0 +1,244 @@
+package service
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/shankar0123/certctl/internal/domain"
+)
+
+// mockNetworkScanRepo is an in-memory scan-target store backing the service tests in this file.
+type mockNetworkScanRepo struct {
+	targets []*domain.NetworkScanTarget
+}
+
+func (m *mockNetworkScanRepo) List(ctx context.Context) ([]*domain.NetworkScanTarget, error) {
+	return m.targets, nil
+}
+
+func (m *mockNetworkScanRepo) ListEnabled(ctx context.Context) ([]*domain.NetworkScanTarget, error) {
+	var enabled []*domain.NetworkScanTarget
+	for _, t := range m.targets {
+		if t.Enabled {
+			enabled = append(enabled, t)
+		}
+	}
+	return enabled, nil
+}
+
+func (m *mockNetworkScanRepo) Get(ctx context.Context, id string) (*domain.NetworkScanTarget, error) {
+	for _, t := range m.targets {
+		if t.ID == id {
+			return t, nil
+		}
+	}
+	return nil, fmt.Errorf("not found: %s", id)
+}
+
+func (m *mockNetworkScanRepo) Create(ctx context.Context, target *domain.NetworkScanTarget) error {
+	m.targets = append(m.targets, target)
+	return nil
+}
+
+func (m *mockNetworkScanRepo) Update(ctx context.Context, target *domain.NetworkScanTarget) error {
+	for i, t := range m.targets {
+		if t.ID == target.ID {
+			m.targets[i] = target
+			return nil
+		}
+	}
+	return fmt.Errorf("not found: %s", target.ID)
+}
+
+func (m *mockNetworkScanRepo) Delete(ctx context.Context, id string) error {
+	for i, t := range m.targets {
+		if t.ID == id {
+			m.targets = append(m.targets[:i], m.targets[i+1:]...)
+			return nil
+		}
+	}
+	return fmt.Errorf("not found: %s", id)
+}
+
+func (m *mockNetworkScanRepo) UpdateScanResults(ctx context.Context, id string, scanAt time.Time, durationMs int, certsFound int) error {
+	for _, t := range m.targets {
+		if t.ID == id {
+			t.LastScanAt = &scanAt
+			d := durationMs
+			t.LastScanDurationMs = &d
+			c := certsFound
+			t.LastScanCertsFound = &c
+			return nil
+		}
+	}
+	return fmt.Errorf("not found: %s", id)
+}
+
+func TestExpandCIDR_SingleIP(t *testing.T) {
+	ips := expandCIDR("192.168.1.1")
+	if len(ips) != 1 || ips[0] != "192.168.1.1" {
+		t.Errorf("expected [192.168.1.1], got %v", ips)
+	}
+}
+
+func TestExpandCIDR_Slash30(t *testing.T) {
+	// /30 = 4 total addresses, 2 usable (remove network + broadcast)
+	ips := expandCIDR("10.0.0.0/30")
+	if len(ips) != 2 {
+		t.Errorf("expected 2 usable IPs for /30, got %d: %v", len(ips), ips)
+	}
+	if len(ips) == 2 && (ips[0] != "10.0.0.1" || ips[1] != "10.0.0.2") {
+		t.Errorf("expected [10.0.0.1 10.0.0.2], got %v", ips)
+	}
+}
+
+func TestExpandCIDR_Slash24(t *testing.T) {
+	ips := expandCIDR("10.0.0.0/24")
+	if len(ips) != 254 {
+		t.Errorf("expected 254 usable IPs for /24, got %d", len(ips))
+	}
+}
+
+func TestExpandCIDR_TooLarge(t *testing.T) {
+	// /16 = 65536 IPs, exceeds /20 cap
+	ips := expandCIDR("10.0.0.0/16")
+	if len(ips) != 0 {
+		t.Errorf("expected empty for /16 (too large), got %d", len(ips))
+	}
+}
+
+func TestExpandCIDR_InvalidInput(t *testing.T) {
+	ips := expandCIDR("not-a-cidr")
+	if len(ips) != 0 {
+		t.Errorf("expected empty for invalid input, got %v", ips)
+	}
+}
+
+func TestNetworkScanService_CreateTarget(t *testing.T) {
+	repo := &mockNetworkScanRepo{}
+	auditRepo := newMockAuditRepository()
+	auditService := NewAuditService(auditRepo)
+
+	svc := NewNetworkScanService(repo, nil, auditService, nil)
+
+	target, err := svc.CreateTarget(context.Background(), &domain.NetworkScanTarget{
+		Name:  "Test Network",
+		CIDRs: []string{"10.0.0.0/24"},
+		Ports: []int{443, 8443},
+	})
+	if err != nil {
+		t.Fatalf("CreateTarget failed: %v", err)
+	}
+	if target.ID == "" {
+		t.Error("expected non-empty ID")
+	}
+	if !target.Enabled {
+		t.Error("expected target to be enabled by default")
+	}
+	if target.ScanIntervalHours != 6 {
+		t.Errorf("expected default interval 6h, got %d", target.ScanIntervalHours)
+	}
+	if target.TimeoutMs != 5000 {
+		t.Errorf("expected default timeout 5000ms, got %d", target.TimeoutMs)
+	}
+}
+
+func TestNetworkScanService_CreateTarget_ValidationErrors(t *testing.T) {
+	repo := &mockNetworkScanRepo{}
+	auditRepo := newMockAuditRepository()
+	auditService := NewAuditService(auditRepo)
+	svc := NewNetworkScanService(repo, nil, auditService, nil)
+
+	tests := []struct {
+		name   string
+		target *domain.NetworkScanTarget
+		errMsg string
+	}{
+		{
+			name:   "missing name",
+			target: &domain.NetworkScanTarget{CIDRs: []string{"10.0.0.0/24"}},
+			errMsg: "name is required",
+		},
+		{
+			name:   "missing cidrs",
+			target: &domain.NetworkScanTarget{Name: "test"},
+			errMsg: "at least one CIDR is required",
+		},
+		{
+			name:   "invalid cidr",
+			target: &domain.NetworkScanTarget{Name: "test", CIDRs: []string{"not-valid"}},
+			errMsg: "invalid CIDR or IP",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := svc.CreateTarget(context.Background(), tt.target)
+			if err == nil {
+				t.Fatal("expected error")
+			}
+			if !containsSubstring(err.Error(), tt.errMsg) {
+				t.Errorf("expected error containing %q, got %q", tt.errMsg, err.Error())
+			}
+		})
+	}
+}
+
+func TestNetworkScanService_DeleteTarget(t *testing.T) {
+	repo := &mockNetworkScanRepo{
+		targets: []*domain.NetworkScanTarget{
+			{ID: "nst-1", Name: "test"},
+		},
+	}
+	auditRepo := newMockAuditRepository()
+	auditService := NewAuditService(auditRepo)
+	svc := NewNetworkScanService(repo, nil, auditService, nil)
+
+	if err := svc.DeleteTarget(context.Background(), "nst-1"); err != nil {
+		t.Fatalf("DeleteTarget failed: %v", err)
+	}
+	if len(repo.targets) != 0 {
+		t.Error("expected target to be deleted")
+	}
+}
+
+func TestNetworkScanService_ListTargets(t *testing.T) {
+	repo := &mockNetworkScanRepo{
+		targets: []*domain.NetworkScanTarget{
+			{ID: "nst-1", Name: "target1"},
+			{ID: "nst-2", Name: "target2"},
+		},
+	}
+	svc := NewNetworkScanService(repo, nil, nil, nil)
+
+	targets, err := svc.ListTargets(context.Background())
+	if err != nil {
+		t.Fatalf("ListTargets failed: %v", err)
+	}
+	if len(targets) != 2 {
+		t.Errorf("expected 2 targets, got %d", len(targets))
+	}
+}
+
+func TestExpandEndpoints(t *testing.T) {
+	svc := &NetworkScanService{}
+	endpoints := svc.expandEndpoints([]string{"192.168.1.1"}, []int{443, 8443})
+	if len(endpoints) != 2 { // fatal: the index checks below would panic on a short slice
+		t.Fatalf("expected 2 endpoints, got %d: %v", len(endpoints), endpoints)
+	}
+	if endpoints[0] != "192.168.1.1:443" {
+		t.Errorf("expected 192.168.1.1:443, got %s", endpoints[0])
+	}
+	if endpoints[1] != "192.168.1.1:8443" {
+		t.Errorf("expected 192.168.1.1:8443, got %s", endpoints[1])
+	}
+}
+
+// containsSubstring reports whether substr occurs within s. It is a thin
+// wrapper over strings.Contains, kept so existing call sites do not change.
+func containsSubstring(s, substr string) bool {
+	return strings.Contains(s, substr)
+}
diff --git a/migrations/000007_network_discovery.down.sql b/migrations/000007_network_discovery.down.sql
new file mode 100644
index 0000000..7857a71
--- /dev/null
+++ b/migrations/000007_network_discovery.down.sql
@@ -0,0 +1 @@
+DROP TABLE IF EXISTS network_scan_targets;
diff --git a/migrations/000007_network_discovery.up.sql b/migrations/000007_network_discovery.up.sql
new file mode 100644
index 0000000..0a8eca0
--- /dev/null
+++ b/migrations/000007_network_discovery.up.sql
@@ -0,0 +1,21 @@
+-- Migration 000007: Network Discovery (Active TLS Scanning)
+-- The control plane actively scans network endpoints for TLS certificates.
+-- Results feed into the existing discovery pipeline (discovered_certificates table).
+
+-- Network scan targets define CIDR ranges and ports to probe for TLS certificates
+CREATE TABLE IF NOT EXISTS network_scan_targets (
+    id TEXT PRIMARY KEY, -- application-assigned identifier (tests use values like 'nst-1')
+    name TEXT NOT NULL, -- human-readable label; the service rejects targets without one
+    cidrs TEXT[] NOT NULL DEFAULT '{}', -- CIDR blocks or single IP addresses to expand and probe
+    ports INTEGER[] NOT NULL DEFAULT '{443}', -- TCP ports probed on every expanded address
+    enabled BOOLEAN NOT NULL DEFAULT TRUE, -- flag backing the partial index below
+    scan_interval_hours INTEGER NOT NULL DEFAULT 6, -- same default the service tests expect (6h)
+    timeout_ms INTEGER NOT NULL DEFAULT 5000, -- same default the service tests expect (5000ms)
+    last_scan_at TIMESTAMPTZ, -- nullable: no value until a scan result is recorded
+    last_scan_duration_ms INTEGER, -- nullable: duration of the most recent recorded scan
+    last_scan_certs_found INTEGER, -- nullable: certificate count of the most recent recorded scan
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -- NOTE(review): no trigger here; presumably maintained by the application
+);
+
+CREATE INDEX IF NOT EXISTS idx_network_scan_targets_enabled ON network_scan_targets(enabled) WHERE enabled = TRUE; -- partial index: only enabled rows are indexed