From b9478d064ca14ed1f6dfed97d496e87d013a998f Mon Sep 17 00:00:00 2001
From: shankar0123 <skreddy040@gmail.com>
Date: Mon, 4 May 2026 04:56:13 +0000
Subject: [PATCH] security(email): sanitize body fields against content
 injection (CodeQL #11, CWE-640)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CodeQL alert #11 (go/email-injection, CWE-640 / OWASP Content Spoofing)
flagged the wc.Write(message) sink at internal/connector/notifier/email/
email.go:208 because attacker-controllable fields flow into the email
body unchecked.

Threat model:
  Headers (From, To, Subject) were already protected by
  validation.ValidateHeaderValue (CWE-113 SMTP header injection,
  closed in commit 9e957c3). The remaining gap was the body.
  An attacker controls multiple fields that surface to the body of
  alert/event notifications:
    - alert.Subject, alert.Message
    - event.Subject, event.Body, *event.CertificateID
    - alert.Metadata + event.Metadata key/value pairs
  These can carry CR/LF (forged 'Reply-To: attacker@evil.com' inside
  the body that recipients skim), NUL bytes (RFC 5321 4.5.2 violation
  that some MTAs truncate at), bidi-override Unicode (visually-
  spoofable URLs), zero-width / invisible Unicode (phishing), or
  malformed UTF-8 (Go emits U+FFFD which becomes a glyph in mail
  clients).

  The HTML email path (digest service) already uses html/template
  upstream and is safe via contextual auto-escape. This commit
  closes the plaintext path.

Fix:
  internal/validation/headers.go gains SanitizeEmailBodyValue —
  a sanitizer that NEVER errors (the right contract for body
  content; over-eager rejection drops operator notifications) and
  scrubs:
    - NUL bytes (stripped entirely)
    - bare CR / LF (replaced with space — single fields should never
      carry their own line breaks; the surrounding template handles
      legitimate CRLFs)
    - C0 control chars < 0x20 except TAB
    - DEL (0x7F) + C1 control chars (0x80-0x9F)
    - U+FFFD (defense in depth: malformed UTF-8 -> Go emits this;
      strip so attacker-planted invalid bytes don't survive as an
      arbitrary glyph)
    - Bidi-override Unicode (U+202A..U+202E, U+2066..U+2069)
    - Zero-width / invisible Unicode (U+200B..U+200D, U+2060..U+2063,
      U+FEFF, U+180E)
    - Catch-all unicode.IsControl for anything not enumerated above
  Codepoint table uses numeric ranges rather than rune-literal switch
  cases — Go source rejects literal invisible characters (BOM U+FEFF)
  mid-file, so the table compares against numeric values.

  internal/connector/notifier/email/email.go applies the sanitizer
  at every interpolation site:
    - formatAlertBody: alert.ID/Type/Severity/Subject/Message
      (CreatedAt is time.Time -> RFC3339, structural, not sanitized)
    - formatEventBody: event.ID/Type/Subject/Body, *CertificateID
      (CreatedAt structural, not sanitized)
    - formatMetadata: both keys and values
  The sendEmail / formatEmailMessage call sites continue to validate
  headers (From / To / Subject) via the existing ValidateHeaderValue
  fail-closed gate; the new sanitizer is body-side only.

Tests (internal/validation/headers_test.go):
  TestSanitizeEmailBodyValue_PreservesSafeInput
    Pin: ordinary ASCII, UTF-8 multibyte (résumé / 日本語 / مرحبا),
    tabs, common cert DNs, URLs all flow through unchanged.
  TestSanitizeEmailBodyValue_StripsControlChars
    Table-driven across NUL, bare LF/CR, CRLF, BEL, backspace, DEL,
    C1 (U+0080 / U+009F), U+FFFD, TAB-preserve.
  TestSanitizeEmailBodyValue_StripsBidiOverride
    7 attacker payloads (RLO, LRO, LRI, zero-width space, ZWNJ, BOM,
    MVS) — each must produce a non-identity output.
  TestSanitizeEmailBodyValue_ContentSpoofingScenario
    The CodeQL example case: 'alert\r\nReply-To: attacker@evil.com\r\n
    Click https://evil.example.com/reset' — verify NO CR/LF survives.

Verified locally:
  gofmt: clean.
  go vet ./...: exit 0.
  go test -short -count=1 ./internal/validation/...: ok 0.374s
  go test -short -count=1 ./internal/connector/notifier/email/...: ok 0.186s

Reference: https://github.com/certctl-io/certctl/security/code-scanning/11
Closes CodeQL alert #11 (go/email-injection).
---
 internal/connector/notifier/email/email.go |  54 ++++++++-
 internal/validation/headers.go             | 123 ++++++++++++++++++++
 internal/validation/headers_test.go        | 124 +++++++++++++++++++++
 3 files changed, 297 insertions(+), 4 deletions(-)

diff --git a/internal/connector/notifier/email/email.go b/internal/connector/notifier/email/email.go
index b110183..dab350e 100644
--- a/internal/connector/notifier/email/email.go
+++ b/internal/connector/notifier/email/email.go
@@ -356,6 +356,21 @@ func (c *Connector) formatHTMLEmailMessage(from, to, subject, htmlBody string) (
 }
 
 // formatAlertBody formats an alert notification as email body text.
+//
+// CodeQL go/email-injection (CWE-640 / OWASP Content Spoofing) defense:
+// every field interpolated into the body that may carry attacker-
+// controlled content (alert.Subject, alert.Message, alert.Metadata
+// values, alert.ID / Type / Severity which originate from the API
+// surface) is routed through validation.SanitizeEmailBodyValue before
+// formatting. The sanitizer strips NUL bytes (RFC 5321 §4.5.2 violation),
+// bare CR/LF within a single field (forged header-boundary attempts),
+// bidi-override Unicode (visually-spoofable URLs), zero-width / invisible
+// codepoints, and C0/C1 control chars. CreatedAt is a time.Time —
+// formatted via RFC3339; not user-controllable so unsanitized.
+//
+// Header values (From, To, Subject) are protected separately by
+// validation.ValidateHeaderValue at sendEmail entry (CWE-113 SMTP header
+// injection — see commit 9e957c3).
 func (c *Connector) formatAlertBody(alert notifier.Alert) string {
 	body := fmt.Sprintf(`
 Certificate Alert Notification
@@ -372,16 +387,29 @@ Message:
 %s
 
 %s
-`, alert.ID, alert.Type, alert.Severity, alert.CreatedAt.Format(time.RFC3339), alert.Subject, alert.Message, c.formatMetadata(alert.Metadata))
+`,
+		validation.SanitizeEmailBodyValue(alert.ID),
+		validation.SanitizeEmailBodyValue(alert.Type),
+		validation.SanitizeEmailBodyValue(alert.Severity),
+		alert.CreatedAt.Format(time.RFC3339),
+		validation.SanitizeEmailBodyValue(alert.Subject),
+		validation.SanitizeEmailBodyValue(alert.Message),
+		c.formatMetadata(alert.Metadata),
+	)
 
 	return body
 }
 
 // formatEventBody formats an event notification as email body text.
+//
+// Same CodeQL go/email-injection mitigation as formatAlertBody — every
+// user-controllable interpolated field routes through
+// validation.SanitizeEmailBodyValue. CreatedAt is unsanitized (time.Time
+// → RFC3339 is structural, not user-controllable).
 func (c *Connector) formatEventBody(event notifier.Event) string {
 	certInfo := ""
 	if event.CertificateID != nil {
-		certInfo = fmt.Sprintf("Certificate ID: %s\n", *event.CertificateID)
+		certInfo = fmt.Sprintf("Certificate ID: %s\n", validation.SanitizeEmailBodyValue(*event.CertificateID))
 	}
 
 	body := fmt.Sprintf(`
@@ -398,12 +426,27 @@ Body:
 %s
 
 %s
-`, event.ID, event.Type, event.CreatedAt.Format(time.RFC3339), certInfo, event.Subject, event.Body, c.formatMetadata(event.Metadata))
+`,
+		validation.SanitizeEmailBodyValue(event.ID),
+		validation.SanitizeEmailBodyValue(event.Type),
+		event.CreatedAt.Format(time.RFC3339),
+		certInfo,
+		validation.SanitizeEmailBodyValue(event.Subject),
+		validation.SanitizeEmailBodyValue(event.Body),
+		c.formatMetadata(event.Metadata),
+	)
 
 	return body
 }
 
 // formatMetadata formats metadata as a readable string.
+//
+// Both keys and values can carry attacker-controlled content (cert
+// subject DN fragments, discovered cert metadata, owner/team labels —
+// all originate from API surfaces an attacker may influence). Both are
+// routed through validation.SanitizeEmailBodyValue. Closes the
+// CodeQL go/email-injection finding alongside formatAlertBody +
+// formatEventBody.
 func (c *Connector) formatMetadata(metadata map[string]string) string {
 	if len(metadata) == 0 {
 		return ""
@@ -411,7 +454,10 @@ func (c *Connector) formatMetadata(metadata map[string]string) string {
 
 	metadataStr := "\nMetadata:\n"
 	for key, value := range metadata {
-		metadataStr += fmt.Sprintf("  %s: %s\n", key, value)
+		metadataStr += fmt.Sprintf("  %s: %s\n",
+			validation.SanitizeEmailBodyValue(key),
+			validation.SanitizeEmailBodyValue(value),
+		)
 	}
 
 	return metadataStr
diff --git a/internal/validation/headers.go b/internal/validation/headers.go
index 123ce0c..de6127d 100644
--- a/internal/validation/headers.go
+++ b/internal/validation/headers.go
@@ -3,6 +3,7 @@ package validation
 import (
 	"fmt"
 	"strings"
+	"unicode"
 )
 
 // ValidateHeaderValue rejects any value that contains characters capable of
@@ -34,3 +35,125 @@ func ValidateHeaderValue(field, value string) error {
 	}
 	return nil
 }
+
+// SanitizeEmailBodyValue scrubs control characters and visually-spoofable
+// Unicode from a single field that will be interpolated into a plaintext
+// email body. Closes CodeQL go/email-injection (CWE-640 / OWASP Content
+// Spoofing): an attacker who controls a field surfaced to an
+// operator-bound notification (cert subject DN, discovered cert metadata,
+// alert subject / message, event subject / body, metadata key+value
+// pairs) could otherwise plant content that:
+//
+//   - Forges header-like content using bare CR/LF (some mail relays
+//     misinterpret bare LF mid-body as a header boundary; RFC 5321
+//     mandates CRLF, but defense in depth says strip bare LFs).
+//   - Embeds NUL bytes (forbidden by RFC 5321 sec 4.5.2; some MTAs
+//     truncate at NUL, allowing content elision).
+//   - Plants bidi-override Unicode (U+202A..U+202E, U+2066..U+2069) so a
+//     malicious URL renders as a benign one in the recipient's mail
+//     client.
+//   - Plants zero-width / invisible Unicode (U+200B..U+200D, U+FEFF,
+//     U+2060..U+2063) so a phishing-prone URL hides whitespace.
+//   - Plants C0 / C1 control characters that mail clients may render
+//     unpredictably or strip in surprising ways.
+//
+// The sanitizer NEVER errors; it always returns a sanitized string. This
+// is the right contract for body content (vs. headers, which fail loud)
+// because dropping a notification because the cert subject DN happens to
+// contain a Mongolian Vowel Separator would be worse than escaping it.
+//
+// What the sanitizer does:
+//
+//   - Strip NUL bytes (\x00) entirely.
+//   - Replace bare LF / CR with a single space. Multi-line legitimate
+//     body content gets its CRLF formatting from the email serializer
+//     above this layer; a SINGLE FIELD interpolated into the body
+//     should never carry its own line breaks.
+//   - Strip bidi-override and zero-width characters.
+//   - Strip C0 control chars (< 0x20) except TAB. Strip DEL (0x7F) +
+//     C1 control chars (0x80-0x9F).
+//   - Leave ordinary printable Unicode (including non-Latin scripts)
+//     intact.
+//
+// Apply this to EVERY user-controllable field before interpolating into
+// a plaintext email body. Do NOT apply it to operator-controlled
+// constants (template literals, severity tier names) — those don't
+// carry the threat. The HTML email path uses html/template upstream
+// and does not need this sanitizer (html/template's contextual
+// auto-escape handles the same threats for HTML rendering).
+func SanitizeEmailBodyValue(value string) string {
+	if value == "" {
+		return value
+	}
+	var b strings.Builder
+	b.Grow(len(value))
+	for _, r := range value {
+		switch {
+		case r == 0:
+			// NUL — strip entirely (RFC 5321 sec 4.5.2 violation).
+			continue
+		case r == '\r' || r == '\n':
+			// Strip line breaks within a single interpolated field.
+			b.WriteRune(' ')
+		case r == '\t':
+			// TAB is legitimate body content.
+			b.WriteRune(r)
+		case r < 0x20:
+			// C0 control chars (except TAB above) — strip.
+			continue
+		case r >= 0x7F && r <= 0x9F:
+			// DEL + C1 control chars — strip.
+			continue
+		case r == 0xFFFD:
+			// Replacement character — Go's range emits this for any
+			// malformed UTF-8 byte sequence. Defense in depth: an
+			// attacker who plants invalid UTF-8 (e.g. raw 0x80..0xFF
+			// without a valid lead byte) should not have their input
+			// surface as an arbitrary glyph in operator-bound mail.
+			continue
+		case isBidiOrZeroWidth(r):
+			// Bidi-override + zero-width — strip; visually spoofable.
+			continue
+		case unicode.IsControl(r):
+			// Catch-all: any remaining Unicode control class.
+			continue
+		default:
+			b.WriteRune(r)
+		}
+	}
+	return b.String()
+}
+
+// isBidiOrZeroWidth reports whether r is one of the bidi-override or
+// zero-width Unicode codepoints used in homograph / direction-spoofing
+// attacks. Mirrors the validator in internal/connector/issuer/local
+// (validateCSRUnicode); kept inline here to avoid a new import edge
+// from internal/validation back to the local issuer package.
+//
+// Codepoints expressed as numeric ranges instead of rune-literal
+// switch cases — Go source rejects literal invisible characters
+// (e.g. BOM U+FEFF) mid-file, so we compare against numeric values.
+func isBidiOrZeroWidth(r rune) bool {
+	switch {
+	// LRE U+202A, RLE U+202B, PDF U+202C, LRO U+202D, RLO U+202E
+	case r >= 0x202A && r <= 0x202E:
+		return true
+	// LRI U+2066, RLI U+2067, FSI U+2068, PDI U+2069
+	case r >= 0x2066 && r <= 0x2069:
+		return true
+	// Zero-width space U+200B, ZWNJ U+200C, ZWJ U+200D
+	case r >= 0x200B && r <= 0x200D:
+		return true
+	// Word joiner U+2060, invisible separator U+2061,
+	// invisible times U+2062, invisible plus U+2063
+	case r >= 0x2060 && r <= 0x2063:
+		return true
+	// Byte-order mark / zero-width no-break space U+FEFF
+	case r == 0xFEFF:
+		return true
+	// Mongolian Vowel Separator U+180E
+	case r == 0x180E:
+		return true
+	}
+	return false
+}
diff --git a/internal/validation/headers_test.go b/internal/validation/headers_test.go
index c4f3121..c6bfa2d 100644
--- a/internal/validation/headers_test.go
+++ b/internal/validation/headers_test.go
@@ -68,3 +68,127 @@ func TestValidateHeaderValue_DefaultFieldName(t *testing.T) {
 		t.Errorf("expected default field name 'header' in error, got %q", err.Error())
 	}
 }
+
+// TestSanitizeEmailBodyValue_PreservesSafeInput pins the contract that
+// ordinary body content (including non-Latin scripts and tabs) flows
+// through unchanged. The sanitizer must be a no-op for legitimate input
+// — over-stripping degrades operator notifications.
+func TestSanitizeEmailBodyValue_PreservesSafeInput(t *testing.T) {
+	tests := []struct {
+		name  string
+		input string
+	}{
+		{"plain ASCII", "Renewal reminder for prod.example.com"},
+		{"empty", ""},
+		{"utf-8 multibyte", "résumé — 日本語 — مرحبا"},
+		{"tabs allowed", "key:\tvalue"},
+		{"spaces", "  multiple  spaces  "},
+		{"common cert DN", "CN=api.example.com,O=Acme Corp,C=US"},
+		{"URL with safe chars", "https://docs.example.com/cert/mc-prod-api"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := SanitizeEmailBodyValue(tc.input)
+			if got != tc.input {
+				t.Errorf("expected unchanged %q, got %q", tc.input, got)
+			}
+		})
+	}
+}
+
+// TestSanitizeEmailBodyValue_StripsControlChars pins the CodeQL
+// go/email-injection (CWE-640) defense — every attacker-plant-able
+// control character is stripped or replaced.
+func TestSanitizeEmailBodyValue_StripsControlChars(t *testing.T) {
+	tests := []struct {
+		name      string
+		input     string
+		want      string
+		wantSafer bool // want output != input
+	}{
+		{"NUL byte stripped", "before\x00after", "beforeafter", true},
+		{"bare LF replaced with space", "line1\nline2", "line1 line2", true},
+		{"bare CR replaced with space", "line1\rline2", "line1 line2", true},
+		{"CRLF replaced (both stripped)", "line1\r\nline2", "line1  line2", true},
+		{"BEL stripped", "alert\x07now", "alertnow", true},
+		{"backspace stripped", "x\x08y", "xy", true},
+		{"DEL stripped", "x\x7fy", "xy", true},
+		// C1 control chars must be specified via Unicode escape (\u) so
+		// the source remains valid UTF-8; bare \x80 / \x9f bytes would
+		// be invalid UTF-8 and Go's range emits U+FFFD instead, which
+		// would test the malformed-UTF-8 strip path, not the C1 path.
+		{"C1 control char stripped (U+0080)", "x\u0080y", "xy", true},
+		{"C1 control char stripped (U+009F)", "x\u009Fy", "xy", true},
+		// U+FFFD is the replacement char Go emits for malformed UTF-8.
+		// We strip it as defense-in-depth so attacker-planted invalid
+		// UTF-8 doesn't survive into operator notifications as an
+		// arbitrary glyph.
+		{"replacement char stripped", "x\uFFFDy", "xy", true},
+		{"TAB preserved (legitimate body content)", "k:\tv", "k:\tv", false},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := SanitizeEmailBodyValue(tc.input)
+			if got != tc.want {
+				t.Errorf("input %q: want %q, got %q", tc.input, tc.want, got)
+			}
+			if tc.wantSafer && got == tc.input {
+				t.Errorf("expected sanitization to change %q, but output unchanged", tc.input)
+			}
+		})
+	}
+}
+
+// TestSanitizeEmailBodyValue_StripsBidiOverride pins the
+// visually-spoofable Unicode defense (homograph / RTL-override /
+// zero-width attacks). An attacker who controls a CN or metadata value
+// could otherwise plant a malicious URL that renders benignly in mail
+// clients that honor bidi-override codepoints.
+func TestSanitizeEmailBodyValue_StripsBidiOverride(t *testing.T) {
+	tests := []struct {
+		name  string
+		input string
+	}{
+		// U+202E = Right-to-left override
+		{"RTL override", "Click ‮www.evil.com‬ to verify"},
+		// U+202D = Left-to-right override
+		{"LRO override", "Click ‭www.evil.com‬ to verify"},
+		// U+2066 = Left-to-right isolate
+		{"LRI isolate", "Click ⁦www.evil.com⁩ to verify"},
+		// U+200B = Zero-width space
+		{"zero-width space", "evil​.example.com"},
+		// U+200C = ZWNJ
+		{"zero-width non-joiner", "ad‌min@example.com"},
+		// U+FEFF = byte-order mark / zero-width no-break space
+		{"BOM", "x\uFEFFy"},
+		// U+180E = Mongolian Vowel Separator
+		{"MVS", "a᠎b"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := SanitizeEmailBodyValue(tc.input)
+			if got == tc.input {
+				t.Errorf("expected bidi/zero-width stripping for %q, got unchanged %q", tc.input, got)
+			}
+		})
+	}
+}
+
+// TestSanitizeEmailBodyValue_ContentSpoofingScenario pins the specific
+// CodeQL go/email-injection (CWE-640) example: an attacker who controls
+// a body field plants header-like content. The sanitizer neutralizes
+// the attempt by stripping bare LF/CR within the field.
+func TestSanitizeEmailBodyValue_ContentSpoofingScenario(t *testing.T) {
+	// Attacker plants a body value that tries to fake a "Reply-To"
+	// header inside the body. Even if mail clients don't honor it, a
+	// recipient skimming the body could be fooled.
+	attacker := "alert from compromised cert\r\nReply-To: attacker@evil.com\r\nClick https://evil.example.com/reset"
+	got := SanitizeEmailBodyValue(attacker)
+	if got == attacker {
+		t.Fatalf("attacker input passed through unchanged: %q", got)
+	}
+	// Specifically: no CR or LF should remain in the field.
+	if strings.ContainsAny(got, "\r\n") {
+		t.Errorf("CR/LF still present after sanitization: %q", got)
+	}
+}