fix(repository): idempotent sentinel agent creation via ON CONFLICT (M-6)

Sentinel agents (server-scanner, cloud-aws-sm, cloud-azure-kv,
cloud-gcp-sm) were created on startup with a plain INSERT whose
duplicate-key error was swallowed unconditionally. That silenced every
other DB failure too (connectivity drop, permissions change, unrelated
constraint violation) — a restart after the first boot quietly
de-fanged cloud discovery and the network scanner (CWE-662, CWE-209-
adjacent).

Shape A: add AgentRepository.CreateIfNotExists using ON CONFLICT (id)
DO NOTHING RETURNING id + sql.ErrNoRows discrimination. This keeps the
strict Create semantics (duplicate-key is an error) intact for real
agent registration and gives sentinels their own idempotent path.

- repo: CreateIfNotExists returns (created bool, err error); false,nil
  on pre-existing row; false,wrapped err on anything else.
- interface: CreateIfNotExists added to AgentRepository.
- main.go: 4 sentinel sites log Error/Info/Debug distinctly.
- mocks: service + integration mocks implement the new method.
- tests: 4 new testcontainers integration tests cover first-insert,
  idempotent second-call, concurrent 16-goroutine race (exactly one
  creator, no duplicate-key panic), and pre-cancelled context
  surfacing.

Coverage gates (go test -cover): service 67.6%/55, handler 78.6%/60,
domain 92.7%/40, middleware 80.0%/30, crypto 86.7%/85. Race/vet/
golangci-lint v2.11.4 (0 issues)/govulncheck v1.2.0 clean across all
touched packages.
This commit is contained in:
shankar0123
2026-04-17 16:32:07 +00:00
parent 80450c7180
commit 27afa4463d
6 changed files with 293 additions and 11 deletions
+33 -9
View File
@@ -253,9 +253,15 @@ func main() {
Name: "Network Scanner (Server-Side)",
Status: domain.AgentStatusOnline,
}
if err := agentRepo.Create(context.Background(), sentinelAgent); err != nil {
// Ignore duplicate key errors (agent already exists)
logger.Debug("sentinel agent creation", "status", "exists or created", "id", service.SentinelAgentID)
// M-6: use CreateIfNotExists so duplicate rows on restart/upgrade are
// idempotent without swallowing unrelated DB failures (CWE-662).
created, err := agentRepo.CreateIfNotExists(context.Background(), sentinelAgent)
if err != nil {
logger.Error("sentinel agent creation failed", "id", service.SentinelAgentID, "error", err)
} else if created {
logger.Info("sentinel agent created", "id", service.SentinelAgentID)
} else {
logger.Debug("sentinel agent already exists", "id", service.SentinelAgentID)
}
}
@@ -274,8 +280,14 @@ func main() {
Name: "AWS Secrets Manager Discovery",
Status: domain.AgentStatusOnline,
}
if err := agentRepo.Create(context.Background(), sentinelAWS); err != nil {
logger.Debug("sentinel agent creation", "status", "exists or created", "id", service.SentinelAWSSecretsMgr)
// M-6: idempotent create (CWE-662).
created, err := agentRepo.CreateIfNotExists(context.Background(), sentinelAWS)
if err != nil {
logger.Error("sentinel agent creation failed", "id", service.SentinelAWSSecretsMgr, "error", err)
} else if created {
logger.Info("sentinel agent created", "id", service.SentinelAWSSecretsMgr)
} else {
logger.Debug("sentinel agent already exists", "id", service.SentinelAWSSecretsMgr)
}
}
@@ -293,8 +305,14 @@ func main() {
Name: "Azure Key Vault Discovery",
Status: domain.AgentStatusOnline,
}
if err := agentRepo.Create(context.Background(), sentinelAzure); err != nil {
logger.Debug("sentinel agent creation", "status", "exists or created", "id", service.SentinelAzureKeyVault)
// M-6: idempotent create (CWE-662).
created, err := agentRepo.CreateIfNotExists(context.Background(), sentinelAzure)
if err != nil {
logger.Error("sentinel agent creation failed", "id", service.SentinelAzureKeyVault, "error", err)
} else if created {
logger.Info("sentinel agent created", "id", service.SentinelAzureKeyVault)
} else {
logger.Debug("sentinel agent already exists", "id", service.SentinelAzureKeyVault)
}
}
@@ -307,8 +325,14 @@ func main() {
Name: "GCP Secret Manager Discovery",
Status: domain.AgentStatusOnline,
}
if err := agentRepo.Create(context.Background(), sentinelGCP); err != nil {
logger.Debug("sentinel agent creation", "status", "exists or created", "id", service.SentinelGCPSecretMgr)
// M-6: idempotent create (CWE-662).
created, err := agentRepo.CreateIfNotExists(context.Background(), sentinelGCP)
if err != nil {
logger.Error("sentinel agent creation failed", "id", service.SentinelGCPSecretMgr, "error", err)
} else if created {
logger.Info("sentinel agent created", "id", service.SentinelGCPSecretMgr)
} else {
logger.Debug("sentinel agent already exists", "id", service.SentinelGCPSecretMgr)
}
}
+8
View File
@@ -772,6 +772,14 @@ func (m *mockAgentRepository) Create(ctx context.Context, agent *domain.Agent) e
return nil
}
func (m *mockAgentRepository) CreateIfNotExists(ctx context.Context, agent *domain.Agent) (bool, error) {
if _, exists := m.agents[agent.ID]; exists {
return false, nil
}
m.agents[agent.ID] = agent
return true, nil
}
func (m *mockAgentRepository) Update(ctx context.Context, agent *domain.Agent) error {
m.agents[agent.ID] = agent
return nil
+11 -1
View File
@@ -90,8 +90,18 @@ type AgentRepository interface {
List(ctx context.Context) ([]*domain.Agent, error)
// Get retrieves an agent by ID.
Get(ctx context.Context, id string) (*domain.Agent, error)
// Create stores a new agent.
// Create stores a new agent. Callers that want duplicate-key errors surfaced
// (e.g. real-agent registration) must use this method; sentinel/bootstrap
// paths that expect the row to already exist on restart should call
// CreateIfNotExists instead (M-6, CWE-662).
Create(ctx context.Context, agent *domain.Agent) error
// CreateIfNotExists creates an agent only if the ID doesn't already exist
// (INSERT ... ON CONFLICT (id) DO NOTHING). Returns true if the row was
// newly inserted, false if a row with the same ID already existed. Used
// by the sentinel-agent bootstrap path in cmd/server/main.go so restarts
// and upgrades are idempotent without swallowing unrelated database
// failures (M-6, CWE-662).
CreateIfNotExists(ctx context.Context, agent *domain.Agent) (bool, error)
// Update modifies an existing agent.
Update(ctx context.Context, agent *domain.Agent) error
// Delete removes an agent.
+41 -1
View File
@@ -70,7 +70,9 @@ func (r *AgentRepository) Get(ctx context.Context, id string) (*domain.Agent, er
return agent, nil
}
// Create stores a new agent
// Create stores a new agent. Duplicate-key errors surface to the caller —
// real-agent registration paths rely on this to detect collisions. Use
// CreateIfNotExists for sentinel/bootstrap paths where re-inserts are expected.
func (r *AgentRepository) Create(ctx context.Context, agent *domain.Agent) error {
if agent.ID == "" {
agent.ID = uuid.New().String()
@@ -92,6 +94,44 @@ func (r *AgentRepository) Create(ctx context.Context, agent *domain.Agent) error
return nil
}
// CreateIfNotExists creates an agent only if the ID doesn't already exist.
// Used for sentinel agents (server-scanner, cloud-aws-sm, cloud-azure-kv,
// cloud-gcp-sm) on first boot AND on every subsequent restart/upgrade — the
// pre-M-6 code used plain INSERT, swallowed the duplicate-key error, and so
// silently swallowed every other database failure too (CWE-662 /
// CWE-209-adjacent). ON CONFLICT (id) DO NOTHING + RETURNING id +
// sql.ErrNoRows distinguishes "row already existed" (created=false, err=nil)
// from genuine errors (connectivity, permission, constraint violations
// other than the id primary key) which still surface. Returns true if the
// row was newly inserted, false if a row with the same ID already existed.
func (r *AgentRepository) CreateIfNotExists(ctx context.Context, agent *domain.Agent) (bool, error) {
if agent.ID == "" {
agent.ID = uuid.New().String()
}
var id string
err := r.db.QueryRowContext(ctx, `
INSERT INTO agents (id, name, hostname, status, last_heartbeat_at, registered_at, api_key_hash,
os, architecture, ip_address, version)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
ON CONFLICT (id) DO NOTHING
RETURNING id
`, agent.ID, agent.Name, agent.Hostname, agent.Status, agent.LastHeartbeatAt,
agent.RegisteredAt, agent.APIKeyHash,
agent.OS, agent.Architecture, agent.IPAddress, agent.Version).Scan(&id)
if err != nil {
if err == sql.ErrNoRows {
// ON CONFLICT DO NOTHING — a row with this ID already existed.
return false, nil
}
return false, fmt.Errorf("failed to create agent: %w", err)
}
agent.ID = id
return true, nil
}
// Update modifies an existing agent
func (r *AgentRepository) Update(ctx context.Context, agent *domain.Agent) error {
result, err := r.db.ExecContext(ctx, `
+187
View File
@@ -457,6 +457,193 @@ func TestAgentRepository_Delete_NotFound(t *testing.T) {
}
}
// TestAgentRepository_CreateIfNotExists_FirstInsert verifies that a brand-new
// sentinel agent row is inserted and the helper reports created=true (M-6).
func TestAgentRepository_CreateIfNotExists_FirstInsert(t *testing.T) {
tdb := getTestDB(t)
db := tdb.freshSchema(t)
repo := postgres.NewAgentRepository(db)
ctx := context.Background()
now := time.Now().Truncate(time.Microsecond)
agent := &domain.Agent{
ID: "server-scanner",
Name: "Network Scanner (Server-Side)",
Status: domain.AgentStatusOnline,
RegisteredAt: now,
}
created, err := repo.CreateIfNotExists(ctx, agent)
if err != nil {
t.Fatalf("CreateIfNotExists failed: %v", err)
}
if !created {
t.Error("created = false on first insert, want true")
}
got, err := repo.Get(ctx, "server-scanner")
if err != nil {
t.Fatalf("Get failed: %v", err)
}
if got.Name != "Network Scanner (Server-Side)" {
t.Errorf("Name = %q, want %q", got.Name, "Network Scanner (Server-Side)")
}
}
// TestAgentRepository_CreateIfNotExists_Idempotent verifies that a second
// call with the same ID returns created=false and err=nil without mutating
// the existing row — the core M-6 upgrade/restart scenario (CWE-662).
func TestAgentRepository_CreateIfNotExists_Idempotent(t *testing.T) {
tdb := getTestDB(t)
db := tdb.freshSchema(t)
repo := postgres.NewAgentRepository(db)
ctx := context.Background()
now := time.Now().Truncate(time.Microsecond)
first := &domain.Agent{
ID: "cloud-aws-sm",
Name: "AWS Secrets Manager Discovery",
Status: domain.AgentStatusOnline,
RegisteredAt: now,
}
created, err := repo.CreateIfNotExists(ctx, first)
if err != nil {
t.Fatalf("first CreateIfNotExists failed: %v", err)
}
if !created {
t.Fatal("first created = false, want true")
}
// Second call with the same ID but a different name must be a no-op.
second := &domain.Agent{
ID: "cloud-aws-sm",
Name: "Overwritten Name Should Not Persist",
Status: domain.AgentStatusOffline,
RegisteredAt: now.Add(time.Hour),
}
created, err = repo.CreateIfNotExists(ctx, second)
if err != nil {
t.Fatalf("second CreateIfNotExists failed: %v", err)
}
if created {
t.Error("second created = true, want false (row already existed)")
}
// Row must still reflect the original insert.
got, err := repo.Get(ctx, "cloud-aws-sm")
if err != nil {
t.Fatalf("Get failed: %v", err)
}
if got.Name != "AWS Secrets Manager Discovery" {
t.Errorf("Name = %q, want %q (ON CONFLICT DO NOTHING must preserve original row)", got.Name, "AWS Secrets Manager Discovery")
}
if got.Status != domain.AgentStatusOnline {
t.Errorf("Status = %q, want %q", got.Status, domain.AgentStatusOnline)
}
}
// TestAgentRepository_CreateIfNotExists_ConcurrentRace fires N concurrent
// inserts for the same sentinel ID. Exactly one goroutine must see
// created=true; every other must see created=false and err=nil. No panics,
// no duplicate rows, no swallowed errors. This is the scenario that the
// pre-M-6 plain-INSERT path masked with a blanket error log.
func TestAgentRepository_CreateIfNotExists_ConcurrentRace(t *testing.T) {
tdb := getTestDB(t)
db := tdb.freshSchema(t)
repo := postgres.NewAgentRepository(db)
ctx := context.Background()
const N = 16
now := time.Now().Truncate(time.Microsecond)
var (
wg sync.WaitGroup
createdCount int64
errorCount int64
)
wg.Add(N)
for i := 0; i < N; i++ {
go func() {
defer wg.Done()
agent := &domain.Agent{
ID: "cloud-gcp-sm",
Name: "GCP Secret Manager Discovery",
Status: domain.AgentStatusOnline,
RegisteredAt: now,
}
created, err := repo.CreateIfNotExists(ctx, agent)
if err != nil {
atomic.AddInt64(&errorCount, 1)
t.Errorf("CreateIfNotExists returned error: %v", err)
return
}
if created {
atomic.AddInt64(&createdCount, 1)
}
}()
}
wg.Wait()
if errorCount != 0 {
t.Fatalf("errorCount = %d, want 0", errorCount)
}
if createdCount != 1 {
t.Errorf("createdCount = %d, want exactly 1 (only one goroutine may win the insert)", createdCount)
}
// Exactly one row must exist.
agents, err := repo.List(ctx)
if err != nil {
t.Fatalf("List failed: %v", err)
}
count := 0
for _, a := range agents {
if a.ID == "cloud-gcp-sm" {
count++
}
}
if count != 1 {
t.Errorf("row count for cloud-gcp-sm = %d, want 1", count)
}
}
// TestAgentRepository_CreateIfNotExists_GenericErrorSurfaces verifies that
// failures other than the primary-key duplicate (the only collision
// ON CONFLICT (id) absorbs) propagate to the caller instead of being
// swallowed. This is the security property that M-6 restores: the
// pre-fix plain-INSERT path logged every error at Debug level, so a
// connectivity or permission failure would vanish into the log without
// the server surfacing a problem on startup (CWE-662 / CWE-209-adjacent).
//
// Uses a pre-cancelled context to force QueryRowContext to fail with
// context.Canceled — a non-duplicate error class that must surface.
// Does NOT close the shared sql.DB (that would break sibling tests).
func TestAgentRepository_CreateIfNotExists_GenericErrorSurfaces(t *testing.T) {
tdb := getTestDB(t)
db := tdb.freshSchema(t)
repo := postgres.NewAgentRepository(db)
ctx, cancel := context.WithCancel(context.Background())
cancel() // pre-cancel so the driver round-trip fails immediately.
agent := &domain.Agent{
ID: "server-scanner",
Name: "Network Scanner (Server-Side)",
Status: domain.AgentStatusOnline,
RegisteredAt: time.Now(),
}
created, err := repo.CreateIfNotExists(ctx, agent)
if err == nil {
t.Fatal("expected error on cancelled context, got nil (error would have been swallowed pre-M-6)")
}
if created {
t.Error("created = true on failure, want false")
}
if err == sql.ErrNoRows {
t.Error("got sql.ErrNoRows, want a real connection/context error (ErrNoRows is the duplicate-row sentinel)")
}
}
// ============================================================
// Issuer Repository Tests
// ============================================================
+13
View File
@@ -602,6 +602,19 @@ func (m *mockAgentRepo) Create(ctx context.Context, agent *domain.Agent) error {
return nil
}
func (m *mockAgentRepo) CreateIfNotExists(ctx context.Context, agent *domain.Agent) (bool, error) {
m.mu.Lock()
defer m.mu.Unlock()
if m.CreateErr != nil {
return false, m.CreateErr
}
if _, exists := m.Agents[agent.ID]; exists {
return false, nil
}
m.Agents[agent.ID] = agent
return true, nil
}
func (m *mockAgentRepo) Update(ctx context.Context, agent *domain.Agent) error {
m.mu.Lock()
defer m.mu.Unlock()