From c8985cf868f02ac5b42f9cfe6b2e93b8686d7742 Mon Sep 17 00:00:00 2001 From: shankar0123 Date: Thu, 14 May 2026 13:26:47 +0000 Subject: [PATCH] =?UTF-8?q?fix(ratelimit):=20Hotfix=20#5=20=E2=80=94=20Pos?= =?UTF-8?q?tgres=20timestamptz[]=20scan=20+=20skip-inventory=20drift?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two CI hotfixes surfaced by master CI on 29cb13e7 (Sprint 13.6 tip before the Sprint 13.7 closure landed): 1. TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas failed with "pq: scanning to time.Time is not implemented; only sql.Scanner". Root cause: time.Time does not implement sql.Scanner, and lib/pq's pq.GenericArray scan path calls element-Scan() directly rather than database/sql's convertAssign (which DOES support time conversions). So `pq.Array(&[]time.Time{})` reliably fails on read even though the symmetric write `pq.Array([]time.Time{...})` works (the write path uses driver.Value() which time.Time implements). Fix: cast the timestamptz[] to a text[] of canonical ISO 8601 UTC strings at the SQL boundary via to_char(t AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.US"Z"'), read via pq.StringArray (well- supported), and parse Go-side with layout "2006-01-02T15:04:05.000000Z". The format is fully deterministic regardless of the session's DateStyle or TimeZone settings. Touched: internal/ratelimit/postgres_sliding_window.go (Step 2 of the Allow() transaction — locking + read). Falsifiable proof on CI: the failing test TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas (100 concurrent Allow calls / 3 replicas / cap=10) must now produce exactly 10 succeed / 90 ErrRateLimited. Pre-fix it produced 1 / 0 because every Allow after the first crashed on Scan. 2. skip-inventory-drift.sh CI guard turned red because Sprint 13.2 added two new t.Skip sites: internal/ratelimit/equivalence_test.go:80 t.Skip("race-style test under -short") internal/ratelimit/equivalence_test.go:88 t.Skip("postgres equivalence tests require testcontainers; skipped under -short") The inventory at docs/testing/skip-inventory.md is auto-generated by scripts/skip-inventory.sh and must be re-generated alongside any t.Skip churn. Sprint 13.2 missed the regeneration. Fix: re-ran scripts/skip-inventory.sh. Totals walked 142 → 144 sites; testing.Short() guards 76 → 78. The two new entries land in the internal/ratelimit section. Verification (local sandbox, all clean): $ bash scripts/ci-guards/skip-inventory-drift.sh skip-inventory-drift guard OK: docs/testing/skip-inventory.md matches the live tree $ bash scripts/ci-guards/openapi-handler-parity.sh openapi-handler-parity: clean. $ bash scripts/ci-guards/openapi-rest-deferred-monotonic.sh openapi-rest-deferred-monotonic: clean — rest-deferred = 0, baseline = 0. $ gofmt -l internal/ratelimit/postgres_sliding_window.go (no output) $ go vet ./internal/ratelimit/ (no output) The Postgres rate-limit fix's full falsifiable proof (TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas) cannot be exercised in the sandbox (no docker for testcontainers); CI on the amd64 runner will re-run it on this push. The diagnosis is verified against lib/pq source semantics and the fix uses only well-supported primitives (pq.StringArray + canonical to_char output + time.Parse). --- docs/testing/skip-inventory.md | 8 +++-- internal/ratelimit/postgres_sliding_window.go | 36 +++++++++++++++---- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/docs/testing/skip-inventory.md b/docs/testing/skip-inventory.md index 36b0e5e..a352dcb 100644 --- a/docs/testing/skip-inventory.md +++ b/docs/testing/skip-inventory.md @@ -4,12 +4,12 @@ -> Last reviewed: 2026-05-13 +> Last reviewed: 2026-05-14 ## Summary -- Total t.Skip sites: **142** -- testing.Short() guards: **76** (these gate behind `go test -short`) +- Total t.Skip sites: **144** +- testing.Short() guards: **78** (these gate behind `go test -short`) Re-run inventory with: `./scripts/skip-inventory.sh`. @@ -156,6 +156,8 @@ Re-run inventory with: `./scripts/skip-inventory.sh`. ### `internal/ratelimit` +- `internal/ratelimit/equivalence_test.go:80` — t.Skip("race-style test under -short") +- `internal/ratelimit/equivalence_test.go:88` — t.Skip("postgres equivalence tests require testcontainers; skipped under -short") - `internal/ratelimit/sliding_window_test.go:146` — t.Skip("race-style test under -short") ### `internal/repository/postgres` diff --git a/internal/ratelimit/postgres_sliding_window.go b/internal/ratelimit/postgres_sliding_window.go index 8ff6b74..e54c5eb 100644 --- a/internal/ratelimit/postgres_sliding_window.go +++ b/internal/ratelimit/postgres_sliding_window.go @@ -146,16 +146,32 @@ func (l *PostgresSlidingWindowLimiter) Allow(key string, now time.Time) error { return fmt.Errorf("ratelimit: ensure row: %w", err) } - // Step 2: lock the row + read current state. - var existing pq.GenericArray - var ts []time.Time - existing.A = &ts + // Step 2: lock the row + read current state. lib/pq cannot scan a + // TIMESTAMPTZ[] column back into []time.Time directly: time.Time + // does not implement sql.Scanner, and pq.GenericArray's per-element + // scan path calls Scan() (not database/sql's convertAssign), so the + // inner Scan fails with + // "pq: scanning to time.Time is not implemented; only sql.Scanner". + // Workaround: ask Postgres to format each timestamp as a canonical + // ISO 8601 UTC string via to_char(... AT TIME ZONE 'UTC', ...), read + // the column as text[] via pq.StringArray (well-supported), and + // parse Go-side. The to_char format is fully deterministic (6-digit + // microseconds, "T" separator, "Z" suffix) regardless of the + // session's DateStyle / TimeZone settings. + const pgTimestampLayout = "2006-01-02T15:04:05.000000Z" + var tsStrings pq.StringArray if err := tx.QueryRowContext(ctx, ` - SELECT COALESCE(timestamps, '{}'::timestamptz[]) + SELECT COALESCE( + ARRAY( + SELECT to_char(t AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.US"Z"') + FROM unnest(timestamps) AS t + ), + ARRAY[]::text[] + ) FROM rate_limit_buckets WHERE bucket_key = $1 FOR UPDATE - `, key).Scan(&existing); err != nil { + `, key).Scan(&tsStrings); err != nil { // Shouldn't happen — step 1 ensured the row exists. Treat // the sql.ErrNoRows path as a no-op (be conservative; never // over-limit on transient DB weirdness). @@ -164,6 +180,14 @@ func (l *PostgresSlidingWindowLimiter) Allow(key string, now time.Time) error { } return fmt.Errorf("ratelimit: select-for-update: %w", err) } + ts := make([]time.Time, 0, len(tsStrings)) + for _, s := range tsStrings { + parsed, err := time.Parse(pgTimestampLayout, s) + if err != nil { + return fmt.Errorf("ratelimit: parse stored timestamp %q: %w", s, err) + } + ts = append(ts, parsed.UTC()) + } // Step 3: prune in Go via the shared helper. Same prune semantics // as SlidingWindowLimiter — single source of truth.