certctl/internal/validation/unicode.go

// Copyright 2026 certctl LLC. All rights reserved.
// SPDX-License-Identifier: BUSL-1.1

package validation

import (
	"fmt"
	"strings"
	"unicode"
)

// Bundle-9 / Audit L-012 / CWE-1007 (Insufficient Visual Distinction of
// Homoglyphs Presenting to User) + CWE-176 (Improper Handling of Unicode
// Encoding):
//
// Certificate CommonName + Subject Alternative Name fields originate from
// the CSR submitter and feed directly into:
//
//   - The MCP / API surface that humans inspect ("which cert is this?")
//   - The web UI that renders cert lists, deployment targets, audit events
//   - Downstream relying parties that match certs by hostname
//
// An attacker who can submit a CSR (any operator with cert-create capability,
// or anonymous EST/SCEP enrollment) can plant unicode payloads that:
//
//  1. **Visually impersonate** a legitimate hostname via Cyrillic / Greek /
//     Cherokee homoglyphs (e.g. CN="apple.com" with one Cyrillic 'а' that
//     renders identically but routes differently via DNS or matches a
//     different TLS pin).
//
//  2. **Hide content** via zero-width characters (U+200B..U+200D, U+2060,
//     U+FEFF) that don't render but break naive substring matching.
//
//  3. **Reverse render order** via RTL/LTR override characters
//     (U+202A..U+202E, U+2066..U+2069) that make "google.com.evil.org"
//     display as "google.com.evil.org" with the suffix flipped.
//
// ValidateUnicodeSafe rejects all three categories. It does NOT NFC-normalize
// — the audit prompt's invariant is that the validator REJECTS rather than
// silently rewrites, because operators who don't know their CSR's CN was
// rewritten will get certs they didn't ask for.

// ValidateUnicodeSafe returns nil if `name` is safe to use as a certificate
// CN or SAN, or an error describing the first violation found. The error
// message includes the rune offset so operators can locate the problem in
// the CSR they submitted.
//
// Wired in: internal/connector/issuer/local/local.go (CSR-acceptance path).
// Future ride-along sites (M-029): the web frontend's CertificateStep input.
func ValidateUnicodeSafe(name string) error {
	if name == "" {
		// Empty is a different validation concern (handled by ValidateRequired
		// in handler-side ValidateRequired). Don't double-fail here.
		return nil
	}

	// First pass: scan for explicitly forbidden characters.
	for i, r := range name {
		switch {
		case isRTLOverride(r):
			return fmt.Errorf(
				"contains bidirectional override character %U at byte offset %d — refuse (potential reverse-rendering attack, CWE-1007)",
				r, i,
			)
		case isZeroWidth(r):
			return fmt.Errorf(
				"contains zero-width character %U at byte offset %d — refuse (hidden content, CWE-176)",
				r, i,
			)
		case isControl(r):
			return fmt.Errorf(
				"contains control character %U at byte offset %d — refuse",
				r, i,
			)
		}
	}

	// Second pass: per-label mixed-script detection. DNS labels are joined
	// by '.', so we split on '.' and check each label independently. A
	// label that mixes Latin with Cyrillic / Greek / Cherokee is the
	// classic IDN homograph signal.
	for _, label := range strings.Split(name, ".") {
		if err := validateLabelSingleScript(label); err != nil {
			return err
		}
	}

	return nil
}

// isRTLOverride reports whether r is a Unicode bidirectional override
// character that an attacker could use to flip rendered text direction.
func isRTLOverride(r rune) bool {
	switch r {
	case 0x202A, // LEFT-TO-RIGHT EMBEDDING
		0x202B, // RIGHT-TO-LEFT EMBEDDING
		0x202C, // POP DIRECTIONAL FORMATTING
		0x202D, // LEFT-TO-RIGHT OVERRIDE
		0x202E, // RIGHT-TO-LEFT OVERRIDE
		0x2066, // LEFT-TO-RIGHT ISOLATE
		0x2067, // RIGHT-TO-LEFT ISOLATE
		0x2068, // FIRST STRONG ISOLATE
		0x2069: // POP DIRECTIONAL ISOLATE
		return true
	}
	return false
}

// isZeroWidth reports whether r is a Unicode zero-width character that
// renders nothing but breaks substring matching.
func isZeroWidth(r rune) bool {
	switch r {
	case 0x200B, // ZERO WIDTH SPACE
		0x200C, // ZERO WIDTH NON-JOINER
		0x200D, // ZERO WIDTH JOINER
		0x2060, // WORD JOINER
		0xFEFF: // ZERO WIDTH NO-BREAK SPACE / BOM
		return true
	}
	return false
}

// isControl reports whether r is a C0 or C1 control character. Tabs and
// newlines have no business in a certificate name; reject.
func isControl(r rune) bool {
	return r < 0x20 || (r >= 0x7F && r <= 0x9F)
}

// validateLabelSingleScript rejects a DNS label that mixes Latin
// (a–z, A–Z, 0–9, '-') with characters from a different script. Pure-
// non-Latin labels are allowed (e.g. genuine IDN domains in Cyrillic);
// the attack we're defending against is the MIX.
func validateLabelSingleScript(label string) error {
	if label == "" {
		return nil
	}
	hasASCII := false
	for _, r := range label {
		if r < 0x80 {
			hasASCII = true
			break
		}
	}
	if !hasASCII {
		// Pure non-ASCII label — could be a legitimate IDN. Don't
		// reject; the homograph attack we care about is the MIX.
		return nil
	}
	// Has ASCII — assert NO non-ASCII letters present. Non-ASCII
	// non-letter chars (e.g., a digit from a different script) are
	// also rejected to keep the rule simple.
	for i, r := range label {
		if r < 0x80 {
			continue
		}
		if unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsMark(r) {
			return fmt.Errorf(
				"label %q mixes ASCII with non-ASCII script character %U at byte offset %d — refuse (potential IDN homograph, CWE-1007)",
				label, r, i,
			)
		}
	}
	return nil
}