rust-lang · Brace1000 · Apr 9, 2026 · Apr 9, 2026 · Apr 11, 2026 · Apr 11, 2026
diff --git a/tests/ui/README.md b/tests/ui/README.md
@@ -1582,6 +1582,21 @@ Tests on various well-formedness checks, e.g. [Type-checking normal functions](h
 
 Tests on `where` clauses. See [Where clauses | Reference](https://doc.rust-lang.org/reference/items/generics.html#where-clauses).
 
+## `tests/ui/whitespace/`
+
+Tests for whitespace handling in the Rust lexer. The Rust language
+defines whitespace as Unicode Pattern_White_Space, which is not the
+same as what the standard library gives you:
+
+- `is_ascii_whitespace` follows the WhatWG Infra Standard and skips
+  vertical tab (`\x0B`)
+- `is_whitespace` matches Unicode White_Space, which is a broader set
+
+These tests make that gap visible and check that the lexer accepts
+all 11 Pattern_White_Space characters correctly.
+
+See: https://github.com/rustfoundation/interop-initiative/issues/53
+
 ## `tests/ui/windows-subsystem/`: `#![windows_subsystem = ""]`
 
 See [the `windows_subsystem` attribute](https://doc.rust-lang.org/reference/runtime.html#the-windows_subsystem-attribute).

diff --git a/tests/ui/whitespace/ascii_whitespace_excludes_vertical_tab.rs b/tests/ui/whitespace/ascii_whitespace_excludes_vertical_tab.rs
@@ -0,0 +1,22 @@
+//@ run-pass
+// This test checks that split_ascii_whitespace does NOT split on
+// vertical tab (\x0B), because the standard library uses the WhatWG
+// Infra Standard definition of ASCII whitespace, which excludes
+// vertical tab.
+//
+// See: https://github.com/rust-lang/rust-project-goals/issues/53
+
+fn main() {
+    let s = "a\x0Bb";
+
+    let parts: Vec<&str> = s.split_ascii_whitespace().collect();
+
+    assert_eq!(parts.len(), 1,
+        "vertical tab should not be treated as ASCII whitespace");
+
+    let s2 = "a b";
+    let parts2: Vec<&str> = s2.split_ascii_whitespace().collect();
+    assert_eq!(parts2.len(), 2,
+        "regular space should split correctly");
+
+}
diff --git a/tests/ui/whitespace/invalid_whitespace.rs b/tests/ui/whitespace/invalid_whitespace.rs
@@ -0,0 +1,13 @@
+// This test ensures that the Rust lexer rejects invalid whitespace
+// characters such as ZERO WIDTH SPACE.
+
+//@ check-fail
+
+fn main() {
+    let x = 5;
+    let y = 10;
+
+    let a=x + y;
+    //~^ ERROR unknown start of token
+    //~| HELP invisible characters like
+}
diff --git a/tests/ui/whitespace/invalid_whitespace.stderr b/tests/ui/whitespace/invalid_whitespace.stderr
@@ -0,0 +1,10 @@
+error: unknown start of token: \u{200b}
+  --> $DIR/invalid_whitespace.rs:10:11
+   |
+LL |     let a=x + y;
+   |           ^
+   |
+   = help: invisible characters like '\u{200b}' are not usually visible in text editors
+
+error: aborting due to 1 previous error
+
diff --git a/tests/ui/whitespace/vertical_tab_lexer.rs b/tests/ui/whitespace/vertical_tab_lexer.rs
@@ -0,0 +1,58 @@
+//@ run-pass
+// ignore-tidy-tab
+//
+// Tests that the Rust lexer accepts Unicode Pattern_White_Space characters.
+//
+// Worth noting: the Rust reference defines whitespace as Pattern_White_Space,
+// which is not the same as what is_ascii_whitespace or is_whitespace give you.
+//
+// is_ascii_whitespace follows WhatWG and skips vertical tab (\x0B).
+// is_whitespace uses Unicode White_Space, which is a broader set.
+//
+// The 11 characters that actually count as whitespace in Rust source:
+//   \x09 \x0A \x0B \x0C \x0D \x20 \u{85} \u{200E} \u{200F} \u{2028} \u{2029}
+//
+// Ref: https://github.com/rustfoundation/interop-initiative/issues/53
+
+#[rustfmt::skip]
+fn main() {
+    // tab (\x09) between let and the name
+    let	_ws1 = 1_i32;
+
+    // vertical tab (\x0B) between let and the name
+    // this is the one is_ascii_whitespace gets wrong
+    let_ws2 = 2_i32;
+
+    // form feed (\x0C) between let and the name
+    let_ws3 = 3_i32;
+
+    // plain space (\x20), here just so every character is represented
+    let _ws4 = 4_i32;
+
+    // NEL (\u{85}) between let and the name
+    let_ws5 = 5_i32;
+
+    // left-to-right mark (\u{200E}) between let and the name
+    let‎_ws6 = 6_i32;
+
+    // right-to-left mark (\u{200F}) between let and the name
+    let‏_ws7 = 7_i32;
+
+    // \x0A, \x0D, \u{2028}, \u{2029} are also Pattern_White_Space but they
+    // act as line endings, so you can't stick them in the middle of a statement.
+    // The lexer still handles them correctly at line boundaries.
+
+    // These are Unicode White_Space but NOT Pattern_White_Space:
+    //   \u{A0}   no-break space       \u{1680} ogham space mark
+    //   \u{2000} en quad              \u{2001} em quad
+    //   \u{2002} en space             \u{2003} em space
+    //   \u{2004} three-per-em space   \u{2005} four-per-em space
+    //   \u{2006} six-per-em space     \u{2007} figure space
+    //   \u{2008} punctuation space    \u{2009} thin space
+    //   \u{200A} hair space           \u{202F} narrow no-break space
+    //   \u{205F} medium math space    \u{3000} ideographic space
+
+    // add them up so the compiler doesn't complain about unused variables
+    let _sum = _ws1 + _ws2 + _ws3 + _ws4 + _ws5 + _ws6 + _ws7;
+    println!("{}", _sum);
+}