Skip to content

Commit caa4205

Browse files
authored
Merge pull request #2196 from rust-lang/TC/support-nonascii-in-grammar-rule-names
Support non-ASCII Unicode in grammar rule names
2 parents 2dfed35 + 93a2336 commit caa4205

File tree

2 files changed

+28
-12
lines changed

2 files changed

+28
-12
lines changed

tools/grammar/src/parser.rs

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,24 @@ macro_rules! bail {
3838

3939
type Result<T> = std::result::Result<T, Error>;
4040

41+
/// Whether a character can start a grammar rule name.
///
/// Accepts ASCII letters, the underscore, and every non-ASCII
/// character, which admits Unicode symbols such as `⊥` (bottom) and
/// `⊤` (top) as well as non-ASCII letters. No further filtering is
/// applied to non-ASCII input, so non-ASCII whitespace is accepted
/// too.
///
/// ASCII digits are rejected so a name cannot start with a number,
/// and ASCII symbols are rejected because characters such as `+`,
/// `|`, `~`, and `^` are grammar syntax.
fn is_name_start(ch: char) -> bool {
    // For ASCII input `is_ascii_alphabetic` agrees with
    // `is_alphabetic`; non-ASCII input is accepted wholesale by the
    // last arm.
    ch.is_ascii_alphabetic() || ch == '_' || !ch.is_ascii()
}
50+
51+
/// Whether a character can continue a grammar rule name.
///
/// Accepts ASCII letters, ASCII digits, the underscore, and every
/// non-ASCII character. All other ASCII characters (symbols,
/// whitespace, control characters) end the name.
fn is_name_continue(ch: char) -> bool {
    // For ASCII input `is_ascii_alphanumeric` agrees with
    // `is_alphanumeric`; non-ASCII input is accepted wholesale by the
    // last arm.
    ch.is_ascii_alphanumeric() || ch == '_' || !ch.is_ascii()
}
58+
4159
pub fn parse_grammar(
4260
input: &str,
4361
grammar: &mut Grammar,
@@ -152,18 +170,11 @@ impl Parser<'_> {
152170
}
153171

154172
/// Parses a grammar rule name at the current position.
///
/// Returns `None` when no input remains at `self.index` or when the
/// next character is rejected by `is_name_start`. Otherwise returns
/// the run of `is_name_continue` characters yielded by
/// `self.take_while` as an owned `String`.
///
/// In this diff view, lines following a `-` gutter marker are the
/// replaced implementation and lines following a `+` marker are the
/// new one.
fn parse_name(&mut self) -> Option<String> {
155-
// Names must start with an alphabetic character or
156-
// underscore.
157173
let first = self.input[self.index..].chars().next()?;
158-
if !first.is_alphabetic() && first != '_' {
174+
if !is_name_start(first) {
159175
return None;
160176
}
161-
let name = self.take_while(&|c: char| c.is_alphanumeric() || c == '_');
162-
if name.is_empty() {
163-
None
164-
} else {
165-
Some(name.to_string())
166-
}
177+
Some(self.take_while(&|c| is_name_continue(c)).to_string())
167178
}
168179

169180
fn parse_expression(&mut self) -> Result<Option<Expression>> {
@@ -231,7 +242,7 @@ impl Parser<'_> {
231242
} else if self.input[self.index..]
232243
.chars()
233244
.next()
234-
.map(|ch| ch.is_alphanumeric())
245+
.map(|ch| is_name_start(ch))
235246
.unwrap_or(false)
236247
{
237248
self.parse_nonterminal()

tools/mdbook-spec/src/grammar.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,13 @@ use std::sync::LazyLock;
1111
mod render_markdown;
1212
mod render_railroad;
1313

14-
static NAMES_RE: LazyLock<Regex> =
15-
LazyLock::new(|| Regex::new(r"(?m)^(?:@root )?([A-Za-z0-9_]+)(?: \([^)]+\))? ->").unwrap());
14+
static NAMES_RE: LazyLock<Regex> = LazyLock::new(|| {
15+
// For match rule names, we support standard ASCII identifiers
16+
// or non-ASCII characters (such as `⊥`). This must be
17+
// kept in sync with `is_name_start` and `is_name_continue` in
18+
// `tools/grammar/src/parser.rs`.
19+
Regex::new(r"(?m)^(?:@root )?([A-Za-z0-9_]+|[^\x00-\x7F])(?: \([^)]+\))? ->").unwrap()
20+
});
1621

1722
#[derive(Debug)]
1823
pub struct RenderCtx {

0 commit comments

Comments
 (0)