Files
certctl/internal/validation/unicode.go
T
shankar0123 21aeed4f4e legal: addlicense headers + normalize legacy variants (Phase 0 RED-4)
Phase 0 closure (Path B2, post-rewrite):

addlicense sweep — adds the canonical certctl LLC copyright + BUSL-1.1
SPDX header to every production Go file. Template:

  // Copyright 2026 certctl LLC. All rights reserved.
  // SPDX-License-Identifier: BUSL-1.1

Coverage: 338 / 338 production Go files (cmd/ + internal/, excluding
*_test.go and **/testdata/**). Pre-sweep coverage was 22 / 338 (6.5%);
post-sweep is 338 / 338 (100%).

Normalized 22 pre-existing legacy headers (`// Copyright (c) certctl`
+ `// SPDX-License-Identifier: BSL-1.1`) and 1 file using a
`Certctl Contributors` attribution. The legacy SPDX ID `BSL-1.1`
is non-standard; the official SPDX identifier for Business Source
License 1.1 is `BUSL-1.1` (capital U). All 338 files now share the
canonical form.

Generated via:
  addlicense -c "certctl LLC" -y 2026 \
    -f cowork/legal/copyright-header.tpl \
    -ignore '**/testdata/**' -ignore '**/*_test.go' \
    cmd/ internal/

Verification:
  find cmd internal -name '*.go' -not -name '*_test.go' \
    -not -path '*/testdata/*' \
    -exec grep -L '^// Copyright 2026 certctl LLC' {} \; | wc -l

  Returns: 0

gofmt clean. Header additions are comments only, no compile impact.

Closes: cowork/certctl-architecture-diligence-audit.html#fix-RED-4
2026-05-13 21:23:35 +00:00

165 lines
5.4 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Copyright 2026 certctl LLC. All rights reserved.
// SPDX-License-Identifier: BUSL-1.1
package validation
import (
"fmt"
"strings"
"unicode"
)
// Bundle-9 / Audit L-012 / CWE-1007 (Insufficient Visual Distinction of
// Homoglyphs Presenting to User) + CWE-176 (Improper Handling of Unicode
// Encoding):
//
// Certificate CommonName + Subject Alternative Name fields originate from
// the CSR submitter and feed directly into:
//
// - The MCP / API surface that humans inspect ("which cert is this?")
// - The web UI that renders cert lists, deployment targets, audit events
// - Downstream relying parties that match certs by hostname
//
// An attacker who can submit a CSR (any operator with cert-create capability,
// or anonymous EST/SCEP enrollment) can plant unicode payloads that:
//
// 1. **Visually impersonate** a legitimate hostname via Cyrillic / Greek /
// Cherokee homoglyphs (e.g. CN="apple.com" with one Cyrillic 'а' that
// renders identically but routes differently via DNS or matches a
// different TLS pin).
//
// 2. **Hide content** via zero-width characters (U+200B..U+200D, U+2060,
// U+FEFF) that don't render but break naive substring matching.
//
// 3. **Reverse render order** via RTL/LTR override characters
// (U+202A..U+202E, U+2066..U+2069) that make a name such as
// "google.com.evil.org" display with its suffix visually flipped, so the
// text an operator sees no longer matches the bytes in the certificate.
//
// ValidateUnicodeSafe rejects all three categories. It does NOT NFC-normalize
// — the audit prompt's invariant is that the validator REJECTS rather than
// silently rewrites, because operators who don't know their CSR's CN was
// rewritten will get certs they didn't ask for.
// ValidateUnicodeSafe checks whether name is safe to use as a certificate CN
// or SAN. It returns nil when no violation is found, and otherwise an error
// describing the first violation encountered. Errors for forbidden characters
// carry the offending code point and its byte offset within name, so
// operators can locate the problem in the CSR they submitted.
//
// The empty string is accepted: emptiness is a separate validation concern
// (the handler-side ValidateRequired check) and is not reported twice.
//
// Wired in: internal/connector/issuer/local/local.go (CSR-acceptance path).
// Future ride-along sites (M-029): the web frontend's CertificateStep input.
func ValidateUnicodeSafe(name string) error {
	if len(name) == 0 {
		return nil
	}

	// Pass 1: reject explicitly forbidden code points. The checks run in a
	// fixed order (bidi, zero-width, control) and the first hit wins.
	// Ranging over a string yields the byte index of each rune, which is
	// the offset reported to the caller.
	for offset, ch := range name {
		if isRTLOverride(ch) {
			return fmt.Errorf(
				"contains bidirectional override character %U at byte offset %d — refuse (potential reverse-rendering attack, CWE-1007)",
				ch, offset,
			)
		}
		if isZeroWidth(ch) {
			return fmt.Errorf(
				"contains zero-width character %U at byte offset %d — refuse (hidden content, CWE-176)",
				ch, offset,
			)
		}
		if isControl(ch) {
			return fmt.Errorf(
				"contains control character %U at byte offset %d — refuse",
				ch, offset,
			)
		}
	}

	// Pass 2: mixed-script detection, one DNS label at a time. Labels are
	// the '.'-separated pieces of the name; a label that mixes ASCII with
	// another script is the classic IDN homograph signal.
	labels := strings.Split(name, ".")
	for idx := range labels {
		if err := validateLabelSingleScript(labels[idx]); err != nil {
			return err
		}
	}
	return nil
}
// isRTLOverride reports whether r is a Unicode bidirectional control
// character that an attacker could use to change the direction or ordering
// of rendered text.
//
// The set is the Unicode Bidi_Control property: the explicit embeddings,
// overrides and isolates (U+202A..U+202E, U+2066..U+2069) plus the implicit
// directional marks (U+061C, U+200E, U+200F). The marks are invisible and
// influence how neighbouring neutral characters are ordered, so they are
// refused together with the explicit overrides.
func isRTLOverride(r rune) bool {
	switch r {
	case 0x061C, // ARABIC LETTER MARK
		0x200E, // LEFT-TO-RIGHT MARK
		0x200F, // RIGHT-TO-LEFT MARK
		0x202A, // LEFT-TO-RIGHT EMBEDDING
		0x202B, // RIGHT-TO-LEFT EMBEDDING
		0x202C, // POP DIRECTIONAL FORMATTING
		0x202D, // LEFT-TO-RIGHT OVERRIDE
		0x202E, // RIGHT-TO-LEFT OVERRIDE
		0x2066, // LEFT-TO-RIGHT ISOLATE
		0x2067, // RIGHT-TO-LEFT ISOLATE
		0x2068, // FIRST STRONG ISOLATE
		0x2069: // POP DIRECTIONAL ISOLATE
		return true
	}
	return false
}
// isZeroWidth reports whether r is an invisible Unicode format character
// that renders nothing but breaks byte-wise and substring matching.
//
// Besides the zero-width spaces and joiners, the set includes SOFT HYPHEN,
// MONGOLIAN VOWEL SEPARATOR and the invisible mathematical operators: they
// are normally not displayed either, so a name containing one looks the same
// as the clean name while comparing as different.
func isZeroWidth(r rune) bool {
	switch r {
	case 0x00AD, // SOFT HYPHEN
		0x180E, // MONGOLIAN VOWEL SEPARATOR
		0x200B, // ZERO WIDTH SPACE
		0x200C, // ZERO WIDTH NON-JOINER
		0x200D, // ZERO WIDTH JOINER
		0x2060, // WORD JOINER
		0x2061, // FUNCTION APPLICATION
		0x2062, // INVISIBLE TIMES
		0x2063, // INVISIBLE SEPARATOR
		0x2064, // INVISIBLE PLUS
		0xFEFF: // ZERO WIDTH NO-BREAK SPACE / BOM
		return true
	}
	return false
}
// isControl reports whether r is a control character: anything below U+0020
// (the C0 range, which includes tab and newline), DEL (U+007F), or the C1
// range U+0080..U+009F. None of these belong in a certificate name.
func isControl(r rune) bool {
	switch {
	case r < 0x20:
		// C0 controls.
		return true
	case r >= 0x7F && r <= 0x9F:
		// DEL followed by the C1 controls.
		return true
	default:
		return false
	}
}
// validateLabelSingleScript rejects a DNS label that mixes ASCII characters
// (a-z, A-Z, 0-9, '-', or any other code point below U+0080) with letters,
// numbers, or combining marks from outside ASCII. Labels made up entirely of
// non-ASCII characters are allowed (e.g. genuine IDN labels in Cyrillic);
// the attack being defended against is the MIX.
//
// An empty label is accepted. On rejection the error names the label, the
// offending code point, and that code point's byte offset within the label.
func validateLabelSingleScript(label string) error {
	if label == "" {
		return nil
	}

	hasASCII := false
	for _, r := range label {
		if r < 0x80 {
			hasASCII = true
			break
		}
	}
	if !hasASCII {
		// Pure non-ASCII label — could be a legitimate IDN. Don't
		// reject; the homograph attack we care about is the MIX.
		return nil
	}

	// The label contains ASCII, so no non-ASCII letter, number, or mark may
	// appear alongside it.
	for i, r := range label {
		if r < 0x80 {
			continue
		}
		// unicode.IsNumber covers Nd, Nl and No. unicode.IsDigit covers Nd
		// only and would let letter-number homoglyphs through, e.g. the
		// Roman numerals U+2170 'ⅰ' and U+217F 'ⅿ', which look like the
		// Latin letters 'i' and 'm'.
		if unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r) {
			return fmt.Errorf(
				"label %q mixes ASCII with non-ASCII script character %U at byte offset %d — refuse (potential IDN homograph, CWE-1007)",
				label, r, i,
			)
		}
	}
	return nil
}