From 5ee819cd6136c712083df2bb58bf013efc9f4758 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 07:45:22 -0800 Subject: [PATCH 01/12] Add negative lookahead to the grammar This adds the `!` prefix which represents negative lookahead. This was included in the original PEG paper, though it was called "NOT", whereas I went with a more explicit "NegativeLookahead". This will be helpful in several productions which need to have these kinds of exclusions. The syntax is also commonly used in regular expression engines which usually use `(?!expr)`. This is also common in many other PEG libraries. There is a small risk this could be confusing, since `!` is sometimes used for other purposes in other contexts. For example, Prolog uses `!` for their cut operator. I think this should be fine since it is common with PEG. --- dev-guide/src/grammar.md | 7 ++++++- src/notation.md | 1 + tools/grammar/src/lib.rs | 3 +++ tools/grammar/src/parser.rs | 11 +++++++++++ tools/mdbook-spec/src/grammar/render_markdown.rs | 5 +++++ tools/mdbook-spec/src/grammar/render_railroad.rs | 6 ++++++ 6 files changed, 32 insertions(+), 1 deletion(-) diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 40e4883096..b5be7aa144 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -39,7 +39,11 @@ Sequence -> (` `* AdornedExpr)* ` `* Cut | (` `* AdornedExpr)+ -AdornedExpr -> Expr1 Quantifier? Suffix? Footnote? +AdornedExpr -> Prefix? Expr1 Quantifier? Suffix? Footnote? + +Prefix -> NegativeLookahead + +NegativeLookahead -> `!` Suffix -> ` _` * `_` @@ -135,6 +139,7 @@ The general format is a series of productions separated by blank lines. The expr | Suffix | \_except \[LazyBooleanExpression\]\_ | Adds a suffix to the previous expression to provide an additional English description, rendered in subscript. This can contain limited Markdown, but try to avoid anything except basics like links. 
| | Footnote | \[^extern-safe\] | Adds a footnote, which can supply extra information that may be helpful to the user. The footnote itself should be defined outside of the code block like a normal Markdown footnote. | | Optional | Expr? | The preceding expression is optional. | +| NegativeLookahead | !Expr | Matches if Expr does not follow, without consuming any input. | | Repeat | Expr* | The preceding expression is repeated 0 or more times. | | RepeatNonGreedy | Expr*? | The preceding expression is repeated 0 or more times without being greedy. | | RepeatPlus | Expr+ | The preceding expression is repeated 1 or more times. | diff --git a/src/notation.md b/src/notation.md index 850ee9fb5e..ace8e65bc0 100644 --- a/src/notation.md +++ b/src/notation.md @@ -20,6 +20,7 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets: | xa..=b | HEX_DIGIT1..=5 | a to b repetitions of x, inclusive of b | | Rule1 Rule2 | `fn` _Name_ _Parameters_ | Sequence of rules in order | | \| | `u8` \| `u16`, Block \| Item | Either one or another | +| ! 
| !COMMENT | Matches if the expression does not follow, without consuming any input | | \[ ] | \[`b` `B`] | Any of the characters listed | | \[ - ] | \[`a`-`z`] | Any of the characters in the range | | ~\[ ] | ~\[`b` `B`] | Any characters, except those listed | diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 6fbb886558..986019270b 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -51,6 +51,8 @@ pub enum ExpressionKind { Sequence(Vec), /// `A?` Optional(Box), + /// `!A` + NegativeLookahead(Box), /// `A*` Repeat(Box), /// `A*?` @@ -137,6 +139,7 @@ impl Expression { match &self.kind { ExpressionKind::Grouped(e) | ExpressionKind::Optional(e) + | ExpressionKind::NegativeLookahead(e) | ExpressionKind::Repeat(e) | ExpressionKind::RepeatNonGreedy(e) | ExpressionKind::RepeatPlus(e) diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index f65cb80f97..48c271d255 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -251,6 +251,8 @@ impl Parser<'_> { self.parse_grouped()? } else if next == b'~' { self.parse_neg_expression()? + } else if next == b'!' { + self.parse_negative_lookahead()? } else { return Ok(None); }; @@ -387,6 +389,15 @@ impl Parser<'_> { Ok(ExpressionKind::NegExpression(box_kind(kind))) } + fn parse_negative_lookahead(&mut self) -> Result { + self.expect("!", "expected !")?; + self.space0(); + let Some(e) = self.parse_expr1()? else { + bail!(self, "expected expression after !"); + }; + Ok(ExpressionKind::NegativeLookahead(Box::new(e))) + } + /// Parse e.g. `F00F` after `U+`. 
fn parse_unicode(&mut self) -> Result { let mut xs = Vec::with_capacity(4); diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index edec8da035..59045fc503 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -67,6 +67,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => last_expr(es.last().unwrap()), ExpressionKind::Grouped(_) | ExpressionKind::Optional(_) + | ExpressionKind::NegativeLookahead(_) | ExpressionKind::Repeat(_) | ExpressionKind::RepeatNonGreedy(_) | ExpressionKind::RepeatPlus(_) @@ -119,6 +120,10 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { render_expression(e, cx, output); output.push_str("?"); } + ExpressionKind::NegativeLookahead(e) => { + output.push('!'); + render_expression(e, cx, output); + } ExpressionKind::Repeat(e) => { render_expression(e, cx, output); output.push_str("\\*"); diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index ebb20af1bc..57a440769c 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -143,6 +143,12 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option { + let forward = render_expression(e, cx, stack)?; + let lbox = + LabeledBox::new(forward, Comment::new("not followed by".to_string())); + Box::new(lbox) + } // Treat `e?` and `e{..=1}` / `e{0..=1}` equally. ExpressionKind::Optional(e) | ExpressionKind::RepeatRange { From 2a7c3500a79b234e16c81321214c7321df868a99 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 08:22:17 -0800 Subject: [PATCH 02/12] Add Unicode to character range This adds the ability to specify Unicode code points in a character range. 
This will be useful for defining some productions without using English, and perhaps to be a little clearer. This also extends the Unicode grammar to allow up to 6 characters for larger code points. --- dev-guide/src/grammar.md | 10 ++- src/input-format.md | 4 +- tools/grammar/src/lib.rs | 31 ++++++++- tools/grammar/src/parser.rs | 66 ++++++++++++------- .../src/grammar/render_markdown.rs | 24 ++++--- .../src/grammar/render_railroad.rs | 16 ++++- 6 files changed, 111 insertions(+), 40 deletions(-) diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index b5be7aa144..2dbdcf1ab4 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -85,7 +85,7 @@ Expr1 -> | Group | NegativeExpression -Unicode -> `U+` [`A`-`Z` `0`-`9`]4..=4 +Unicode -> `U+` [`A`-`Z` `0`-`9`]4..=6 NonTerminal -> Name @@ -102,7 +102,11 @@ Characters -> | CharacterTerminal | CharacterName -CharacterRange -> BACKTICK BACKTICK `-` BACKTICK BACKTICK +CharacterRange -> Character `-` Character + +Character -> + BACKTICK BACKTICK + | Unicode CharacterTerminal -> Terminal @@ -127,7 +131,7 @@ The general format is a series of productions separated by blank lines. The expr | Comment | // Single line comment. | A comment extending to the end of the line. | | Terminal | \`example\` | A sequence of exact characters, surrounded by backticks. | | Charset | \[ \`A\`-\`Z\` \`0\`-\`9\` \`_\` \] | A choice from a set of characters, space-separated. There are three different forms. | -| CharacterRange | \[ \`A\`-\`Z\` \] | A range of characters; each character should be in backticks. | +| CharacterRange | \[ \`A\`-\`Z\` \] | A range of characters. Characters can be a Unicode expression or be a literal character surrounded in backticks. | | CharacterTerminal | \[ \`x\` \] | A single character, surrounded by backticks. | | CharacterName | \[ LF \] | A nonterminal, referring to another production. | | Prose | \ | An English description of what should be matched, surrounded in angle brackets. 
| diff --git a/src/input-format.md b/src/input-format.md index 2432da0339..be6bb670b3 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -3,7 +3,9 @@ r[input] r[input.syntax] ```grammar,lexer -@root CHAR -> +@root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value + +@root ASCII -> [U+0000-U+007F] NUL -> U+0000 ``` diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 986019270b..1d64e45143 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -87,7 +87,7 @@ pub enum ExpressionKind { /// `^ A B C` Cut(Box), /// `U+0060` - Unicode(String), + Unicode((char, String)), } #[derive(Copy, Clone, Debug)] @@ -115,7 +115,34 @@ pub enum Characters { /// `` `_` `` Terminal(String), /// `` `A`-`Z` `` - Range(char, char), + Range(Character, Character), +} + +#[derive(Clone, Debug)] +pub enum Character { + Char(char), + /// `U+0060` + /// + /// The `String` is the hex digits after `U+`. + Unicode((char, String)), +} + +impl Character { + pub fn get_ch(&self) -> char { + match self { + Character::Char(ch) => *ch, + Character::Unicode((ch, _)) => *ch, + } + } +} + +impl Display for Character { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Character::Char(ch) => write!(f, "`{ch}`"), + Character::Unicode((_, s)) => write!(f, "U+{s}"), + } + } } impl Grammar { diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index 48c271d255..080d03c66e 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -1,6 +1,6 @@ //! A parser of the ENBF-like grammar. -use super::{Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit}; +use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit}; use std::fmt; use std::fmt::Display; use std::path::Path; @@ -221,7 +221,7 @@ impl Parser<'_> { }; let kind = if self.take_str("U+") { - self.parse_unicode()? + ExpressionKind::Unicode(self.parse_unicode()?) 
} else if self.input[self.index..] .chars() .next() @@ -322,27 +322,19 @@ impl Parser<'_> { /// Parse an element of a character class, e.g. /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``. fn parse_characters(&mut self) -> Result> { - if let Some(b'`') = self.peek() { - let recov = self.index; - let a = self.parse_terminal_str()?; + if let Some(a) = self.parse_character()? { if self.take_str("-") { - //~^ Parse `` `a`-`b` `` character range. - if a.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid start terminal in range"); - } - let recov = self.index; - let b = self.parse_terminal_str()?; - if b.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid end terminal in range"); - } - let a = a.chars().next().unwrap(); - let b = b.chars().next().unwrap(); + let Some(b) = self.parse_character()? else { + bail!(self, "expected character in range"); + }; Ok(Some(Characters::Range(a, b))) } else { //~^ Parse terminal in backticks. - Ok(Some(Characters::Terminal(a))) + let t = match a { + Character::Char(ch) => ch.to_string(), + Character::Unicode(_) => bail!(self, "unicode not supported"), + }; + Ok(Some(Characters::Terminal(t))) } } else if let Some(name) = self.parse_name() { //~^ Parse nonterminal identifier. @@ -352,6 +344,23 @@ impl Parser<'_> { } } + fn parse_character(&mut self) -> Result> { + if let Some(b'`') = self.peek() { + let recov = self.index; + let term = self.parse_terminal_str()?; + if term.len() > 1 { + self.index = recov + 1; + bail!(self, "invalid start terminal in range"); + } + let ch = term.chars().next().unwrap(); + Ok(Some(Character::Char(ch))) + } else if self.take_str("U+") { + Ok(Some(Character::Unicode(self.parse_unicode()?))) + } else { + Ok(None) + } + } + /// Parse e.g. ``. fn parse_prose(&mut self) -> Result { self.expect("<", "expected opening `<`")?; @@ -399,9 +408,9 @@ impl Parser<'_> { } /// Parse e.g. `F00F` after `U+`. 
- fn parse_unicode(&mut self) -> Result { - let mut xs = Vec::with_capacity(4); - for _ in 0..4 { + fn parse_unicode(&mut self) -> Result<(char, String)> { + let mut xs = Vec::with_capacity(6); + let mut push_next = || { match self.peek() { Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => { xs.push(x); @@ -409,8 +418,19 @@ impl Parser<'_> { } _ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"), } + Ok(()) + }; + for _ in 0..4 { + push_next()?; + } + for _ in 0..2 { + if push_next().is_err() { + break; + } } - Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap())) + let s = String::from_utf8(xs).unwrap(); + let ch = char::from_u32(u32::from_str_radix(&s, 16).unwrap()).unwrap(); + Ok((ch, s)) } /// Parse `?` after expression. diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index 59045fc503..7f9624fc86 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production}; use regex::Regex; use std::borrow::Cow; use std::fmt::Write; @@ -186,7 +186,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { output.push_str("^ "); render_expression(e, cx, output); } - ExpressionKind::Unicode(s) => { + ExpressionKind::Unicode((_, s)) => { output.push_str("U+"); output.push_str(s); } @@ -227,12 +227,20 @@ fn render_characters(chars: &Characters, cx: &RenderCtx, output: &mut String) { markdown_escape(s) ) .unwrap(), - Characters::Range(a, b) => write!( - output, - "{a}\ - -{b}" - ) - .unwrap(), + Characters::Range(a, b) => { + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => write!( + output, + "{}", + markdown_escape(&ch.to_string()) + ) + .unwrap(), + 
Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, output); + output.push('-'); + write_ch(b, output); + } } } diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index 57a440769c..1543127889 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production, RangeLimit}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production, RangeLimit}; use railroad::*; use regex::Regex; use std::fmt::Write; @@ -298,7 +298,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option Box::new(Terminal::new(format!("U+{}", s))), + ExpressionKind::Unicode((_, s)) => Box::new(Terminal::new(format!("U+{}", s))), }; } }; @@ -317,7 +317,17 @@ fn render_characters(chars: &Characters, cx: &RenderCtx) -> Box { match chars { Characters::Named(s) => node_for_nt(cx, s), Characters::Terminal(s) => Box::new(Terminal::new(s.clone())), - Characters::Range(a, b) => Box::new(Terminal::new(format!("{a}-{b}"))), + Characters::Range(a, b) => { + let mut s = String::new(); + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => output.push(*ch), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, &mut s); + s.push('-'); + write_ch(b, &mut s); + Box::new(Terminal::new(s)) + } } } From 999f8839e3ae152906c1070344bdb1a802072c81 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:26:19 -0800 Subject: [PATCH 03/12] Use negative lookahead in the grammar This replaces some suffixes and prose with the new negative lookahead syntax instead. This should all have the same meaning. 
--- src/identifiers.md | 2 +- src/input-format.md | 2 +- src/tokens.md | 30 +++++++++++++----------------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/identifiers.md b/src/identifiers.md index 979284a1c7..5abe303ca2 100644 --- a/src/identifiers.md +++ b/src/identifiers.md @@ -16,7 +16,7 @@ NON_KEYWORD_IDENTIFIER -> IDENTIFIER_OR_KEYWORD _except a [strict][lex.keywords. IDENTIFIER -> NON_KEYWORD_IDENTIFIER | RAW_IDENTIFIER RESERVED_RAW_IDENTIFIER -> - `r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by XID_Continue_ + `r#` (`_` | `crate` | `self` | `Self` | `super`) !XID_Continue ``` diff --git a/src/input-format.md b/src/input-format.md index be6bb670b3..d6eca2dc2d 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -5,7 +5,7 @@ r[input.syntax] ```grammar,lexer @root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value -@root ASCII -> [U+0000-U+007F] +ASCII -> [U+0000-U+007F] NUL -> U+0000 ``` diff --git a/src/tokens.md b/src/tokens.md index 047afd76a6..65b000ce60 100644 --- a/src/tokens.md +++ b/src/tokens.md @@ -115,7 +115,7 @@ r[lex.token.literal.suffix.syntax] ```grammar,lexer SUFFIX -> IDENTIFIER_OR_KEYWORD _except `_`_ -SUFFIX_NO_E -> SUFFIX _not beginning with `e` or `E`_ +SUFFIX_NO_E -> ![`e` `E`] SUFFIX ``` r[lex.token.literal.suffix.validity] @@ -253,8 +253,7 @@ r[lex.token.byte.syntax] BYTE_LITERAL -> `b'` ^ ( ASCII_FOR_CHAR | BYTE_ESCAPE ) `'` SUFFIX? -ASCII_FOR_CHAR -> - +ASCII_FOR_CHAR -> ![`'` `\` LF CR TAB] ASCII BYTE_ESCAPE -> `\x` HEX_DIGIT HEX_DIGIT @@ -272,8 +271,7 @@ r[lex.token.str-byte.syntax] BYTE_STRING_LITERAL -> `b"` ^ ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX? -ASCII_FOR_STRING -> - +ASCII_FOR_STRING -> ![`"` `\` CR] ASCII ``` r[lex.token.str-byte.intro] @@ -309,8 +307,7 @@ RAW_BYTE_STRING_CONTENT -> `"` ^ ASCII_FOR_RAW*? 
`"` | `#` RAW_BYTE_STRING_CONTENT `#` -ASCII_FOR_RAW -> - +ASCII_FOR_RAW -> !CR ASCII ``` r[lex.token.str-byte-raw.intro] @@ -559,7 +556,7 @@ r[lex.token.literal.float.syntax] FLOAT_LITERAL -> DEC_LITERAL (`.` DEC_LITERAL)? FLOAT_EXPONENT SUFFIX? | DEC_LITERAL `.` DEC_LITERAL SUFFIX_NO_E? - | DEC_LITERAL `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | DEC_LITERAL `.` !(`.` | `_` | XID_Start) FLOAT_EXPONENT -> (`e`|`E`) (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)* @@ -608,13 +605,12 @@ r[lex.token.literal.reserved.syntax] RESERVED_NUMBER -> BIN_LITERAL [`2`-`9`] | OCT_LITERAL [`8`-`9`] - | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` !(`.` | `_` | XID_Start) | ( BIN_LITERAL | OCT_LITERAL ) (`e`|`E`) - | `0b` `_`* - | `0o` `_`* - | `0x` `_`* + | `0b` `_`* !BIN_DIGIT + | `0o` `_`* !OCT_DIGIT + | `0x` `_`* !HEX_DIGIT | DEC_LITERAL ( `.` DEC_LITERAL )? (`e` | `E`) (`+` | `-`)? 
- ``` r[lex.token.literal.reserved.intro] @@ -657,16 +653,16 @@ r[lex.token.life.syntax] ```grammar,lexer LIFETIME_TOKEN -> RAW_LIFETIME - | `'` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + | `'` IDENTIFIER_OR_KEYWORD !`'` LIFETIME_OR_LABEL -> RAW_LIFETIME - | `'` NON_KEYWORD_IDENTIFIER _not immediately followed by `'`_ + | `'` NON_KEYWORD_IDENTIFIER !`'` RAW_LIFETIME -> - `'r#` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + `'r#` ^ IDENTIFIER_OR_KEYWORD !`'` -RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by `'`_ +RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) !(`'` | XID_Continue) ``` r[lex.token.life.intro] From cc7025c5b439b24091ab4406c14ee8bf51725e8b Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:30:07 -0800 Subject: [PATCH 04/12] Fix LINE_COMMENT grammar This clarifies that bare `//` is explicitly meant to be either followed by LF or EOF. Otherwise it incorrectly matches other comment rules. 
--- src/comments.md | 3 ++- src/input-format.md | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/comments.md b/src/comments.md index a240e7dc58..1320077938 100644 --- a/src/comments.md +++ b/src/comments.md @@ -5,7 +5,8 @@ r[comments.syntax] ```grammar,lexer @root LINE_COMMENT -> `//` (~[`/` `!` LF] | `//`) ~LF* - | `//` + | `//` EOF + | `//` _immediately followed by LF_ BLOCK_COMMENT -> `/*` diff --git a/src/input-format.md b/src/input-format.md index d6eca2dc2d..3e35cba1ee 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -3,11 +3,13 @@ r[input] r[input.syntax] ```grammar,lexer -@root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value +CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value ASCII -> [U+0000-U+007F] NUL -> U+0000 + +EOF -> !CHAR // End of file or input ``` r[input.intro] From 844b827ebb8390c38ece4e97df3a478f5da34438 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:35:57 -0800 Subject: [PATCH 05/12] Fix BLOCK_COMMENT order This fixes the BLOCK_COMMENT grammar so that it follows the rule that the first alternation that matches wins. The previous grammar would fail with the use of the cut operator to parse these two forms. 
--- src/comments.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/comments.md b/src/comments.md index 1320077938..e82cd28ace 100644 --- a/src/comments.md +++ b/src/comments.md @@ -9,12 +9,13 @@ r[comments.syntax] | `//` _immediately followed by LF_ BLOCK_COMMENT -> - `/*` + `/**/` + | `/***/` + | `/*` + ^ ( ~[`*` `!`] | `**` | BLOCK_COMMENT_OR_DOC ) ( BLOCK_COMMENT_OR_DOC | ~`*/` )* `*/` - | `/**/` - | `/***/` @root INNER_LINE_DOC -> `//!` ~[LF CR]* From 20f26498e5b6cab21a3688e460de0d2d9d7defa1 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:54:48 -0800 Subject: [PATCH 06/12] Fix handling of carriage returns in doc comments This fixes the doc comments so that they properly handle a carriage return by using the cut operator. Rustc will fail parsing if a doc comment contains a carriage return. This requires including (LF|EOF) at the end of line so the cut operator has something to complete the line. This also removes the negative `/` from OUTER_LINE_DOC. This does not work correctly with the check for CR, and is not needed because LINE_COMMENT already matches `////`. Later I plan to include a rule for comments that makes it clear the order that they are parsed. A negative lookahead is necessary in OUTER_BLOCK_DOC to prevent it from trying to parse what should be a BLOCK_COMMENT as an OUTER_BLOCK_DOC and failing due to the cut operator. --- src/comments.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/comments.md b/src/comments.md index e82cd28ace..6e4c06744f 100644 --- a/src/comments.md +++ b/src/comments.md @@ -18,20 +18,25 @@ BLOCK_COMMENT -> `*/` @root INNER_LINE_DOC -> - `//!` ~[LF CR]* + `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) + +LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* INNER_BLOCK_DOC -> - `/*!` ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* `*/` + `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` @root OUTER_LINE_DOC -> - `///` (~`/` ~[LF CR]*)? 
+ `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) OUTER_BLOCK_DOC -> - `/**` + `/**` ![`*` `/`] + ^ ( ~`*` | BLOCK_COMMENT_OR_DOC ) - ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* + ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` +BLOCK_CHAR -> (!(`*/` | CR) CHAR) + @root BLOCK_COMMENT_OR_DOC -> BLOCK_COMMENT | OUTER_BLOCK_DOC From bff2d5fccb80ba62fcd936b92c13150881d9633e Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 10:27:55 -0800 Subject: [PATCH 07/12] Add a new COMMENT grammar rule This is intended to indicate the order that the rules are expected to be processed (as defined in this grammar). Of course real parsers can take a different approach if they have the same results. This is roughly similar to the order that rustc takes, though [`block_comment`](https://github.com/rust-lang/rust/blob/d7daac06d87e1252d10eaa44960164faac46beff/compiler/rustc_lexer/src/lib.rs#L782-L817) roughly takes the approach of combining the `/*` prefix, and then deciding if it is an inner doc comment, outer doc comment, or else a regular block comment. LINE_COMMENT must be first so that it is not confused with a doc comment. BLOCK_COMMENT must be last so that its cut operator does not interfere with doc comments that start with `/*`. It could be moved up higher in the list if it had negative lookahead to disambiguate OUTER_BLOCK_DOC, but the expression for that is more complicated than the one in OUTER_BLOCK_DOC. 
--- src/comments.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/comments.md b/src/comments.md index 6e4c06744f..bbb3332539 100644 --- a/src/comments.md +++ b/src/comments.md @@ -3,7 +3,15 @@ r[comments] r[comments.syntax] ```grammar,lexer -@root LINE_COMMENT -> +@root COMMENT -> + LINE_COMMENT + | INNER_LINE_DOC + | OUTER_LINE_DOC + | INNER_BLOCK_DOC + | OUTER_BLOCK_DOC + | BLOCK_COMMENT + +LINE_COMMENT -> `//` (~[`/` `!` LF] | `//`) ~LF* | `//` EOF | `//` _immediately followed by LF_ @@ -17,7 +25,7 @@ BLOCK_COMMENT -> ( BLOCK_COMMENT_OR_DOC | ~`*/` )* `*/` -@root INNER_LINE_DOC -> +INNER_LINE_DOC -> `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* @@ -25,7 +33,7 @@ LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* INNER_BLOCK_DOC -> `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` -@root OUTER_LINE_DOC -> +OUTER_LINE_DOC -> `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) OUTER_BLOCK_DOC -> @@ -37,7 +45,7 @@ OUTER_BLOCK_DOC -> BLOCK_CHAR -> (!(`*/` | CR) CHAR) -@root BLOCK_COMMENT_OR_DOC -> +BLOCK_COMMENT_OR_DOC -> BLOCK_COMMENT | OUTER_BLOCK_DOC | INNER_BLOCK_DOC From 7c12d351303140ffcd667ec0eb6c849eea511cf3 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 10:28:57 -0800 Subject: [PATCH 08/12] Fix desugaring of doc comments rustc actually includes the spaces for doc comments. --- src/comments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/comments.md b/src/comments.md index bbb3332539..ef283a9ea1 100644 --- a/src/comments.md +++ b/src/comments.md @@ -66,7 +66,7 @@ r[comments.doc.syntax] Line doc comments beginning with exactly _three_ slashes (`///`), and block doc comments (`/** ... */`), both outer doc comments, are interpreted as a special syntax for [`doc` attributes]. 
r[comments.doc.attributes] -That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc="Foo"]` and `/** Bar */` turns into `#[doc="Bar"]`. They must therefore appear before something that accepts an outer attribute. +That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc=" Foo"]` and `/** Bar */` turns into `#[doc=" Bar "]`. They must therefore appear before something that accepts an outer attribute. r[comments.doc.inner-syntax] Line comments beginning with `//!` and block comments `/*! ... */` are doc comments that apply to the parent of the comment, rather than the item that follows. From f57a11ff75c458c59224992fce65d7081afdfcae Mon Sep 17 00:00:00 2001 From: Travis Cross Date: Wed, 18 Feb 2026 01:15:05 +0000 Subject: [PATCH 09/12] Add cut to `FLOAT_EXPONENT` and remove reserved alt The cut operator after (`e`|`E`) in `FLOAT_EXPONENT` reflects rustc's actual parsing behavior: once the lexer sees an exponent indicator, it commits and does not backtrack. This makes the last `RESERVED_NUMBER` alternative -- which existed to catch the empty-exponent case -- redundant, since the cut in `FLOAT_EXPONENT` now handles it directly. Co-authored-by: Eric Huss --- src/tokens.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tokens.md b/src/tokens.md index 65b000ce60..d878eabfe2 100644 --- a/src/tokens.md +++ b/src/tokens.md @@ -559,7 +559,7 @@ FLOAT_LITERAL -> | DEC_LITERAL `.` !(`.` | `_` | XID_Start) FLOAT_EXPONENT -> - (`e`|`E`) (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)* + (`e`|`E`) ^ (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)* ``` r[lex.token.literal.float.form] @@ -610,7 +610,6 @@ RESERVED_NUMBER -> | `0b` `_`* !BIN_DIGIT | `0o` `_`* !OCT_DIGIT | `0x` `_`* !HEX_DIGIT - | DEC_LITERAL ( `.` DEC_LITERAL )? (`e` | `E`) (`+` | `-`)? 
``` r[lex.token.literal.reserved.intro] From ae86f1918b8e9ccca36e28391e9413ff1ab701fe Mon Sep 17 00:00:00 2001 From: Travis Cross Date: Wed, 18 Feb 2026 01:37:21 +0000 Subject: [PATCH 10/12] Fix preposition in `CharacterRange` description The description says characters can be "surrounded in backticks", but it'd be better to say "surrounded by". --- dev-guide/src/grammar.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 2dbdcf1ab4..7a4cdea466 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -131,7 +131,7 @@ The general format is a series of productions separated by blank lines. The expr | Comment | // Single line comment. | A comment extending to the end of the line. | | Terminal | \`example\` | A sequence of exact characters, surrounded by backticks. | | Charset | \[ \`A\`-\`Z\` \`0\`-\`9\` \`_\` \] | A choice from a set of characters, space-separated. There are three different forms. | -| CharacterRange | \[ \`A\`-\`Z\` \] | A range of characters. Characters can be a Unicode expression or be a literal character surrounded in backticks. | +| CharacterRange | \[ \`A\`-\`Z\` \] | A range of characters. Characters can be a Unicode expression or be a literal character surrounded by backticks. | | CharacterTerminal | \[ \`x\` \] | A single character, surrounded by backticks. | | CharacterName | \[ LF \] | A nonterminal, referring to another production. | | Prose | \ | An English description of what should be matched, surrounded in angle brackets. | From fc1589736c175015f02ecf6b7741970d00609390 Mon Sep 17 00:00:00 2001 From: Travis Cross Date: Wed, 18 Feb 2026 01:40:10 +0000 Subject: [PATCH 11/12] Fix U+xxxx notation description The grammar now accepts 4-6 hex digits for Unicode code points (needed for values above U+FFFF), so let's update the notation column to reflect the variable width. Let's also capitalize "Unicode", which is a proper noun. 
--- src/notation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/notation.md b/src/notation.md index ace8e65bc0..b74c74b22f 100644 --- a/src/notation.md +++ b/src/notation.md @@ -27,7 +27,7 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets: | ~`string` | ~`\n`, ~`*/` | Any characters, except this sequence | | ( ) | (`,` _Parameter_)? | Groups items | | ^ | `b'` ^ ASCII_FOR_CHAR | The rest of the sequence must match or parsing fails unconditionally ([hard cut operator]) | -| U+xxxx | U+0060 | A single unicode character | +| U+xxxx..xxxxxx | U+0060 | A single Unicode character | | \ | \ | An English description of what should be matched | | Rule suffix | IDENTIFIER_OR_KEYWORD _except `crate`_ | A modification to the previous rule | | // Comment. | // Single line comment. | A comment extending to the end of the line. | From 164f5fa02b8e729381350488605d2f8f4c39bf3c Mon Sep 17 00:00:00 2001 From: Travis Cross Date: Wed, 18 Feb 2026 01:54:52 +0000 Subject: [PATCH 12/12] Add tests for negative lookahead and Unicode These tests cover: - Parser: negative lookahead with nonterminals, terminals, charsets, grouped expressions, within sequences, repetitions, and alternations; error case for trailing `!`; Unicode code points with 4, 5, and 6 hex digits; charset ranges with `Character::Char`, `Character::Unicode`, and mixed forms; charsets combining named entries, terminals, and Unicode ranges. - Markdown renderer: negative lookahead rendering with `!`, Unicode rendering as `U+xxxx`, charset rendering with char and Unicode ranges, cut and neg expression rendering, and markdown escaping. - Railroad renderer: negative lookahead renders as a "not followed by" labeled box, Unicode renders as terminal, charset ranges, cut renders as "no backtracking" labeled box, and neg expression renders as "with the exception of" labeled box. 
--- tools/grammar/src/parser.rs | 331 +++++++++++++++++- .../src/grammar/render_markdown.rs | 178 ++++++++++ .../src/grammar/render_railroad.rs | 117 ++++++- 3 files changed, 624 insertions(+), 2 deletions(-) diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index 080d03c66e..0db6b478b5 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -573,7 +573,7 @@ fn translate_position(input: &str, index: usize) -> (&str, usize, usize) { #[cfg(test)] mod tests { use crate::parser::{parse_grammar, translate_position}; - use crate::{ExpressionKind, Grammar, RangeLimit}; + use crate::{Character, Characters, ExpressionKind, Grammar, RangeLimit}; use std::path::Path; #[test] @@ -778,4 +778,333 @@ mod tests { assert_eq!(max, Some(1)); assert!(matches!(limit, RangeLimit::HalfOpen)); } + + // --- Negative lookahead tests --- + + #[test] + fn lookahead_simple_nonterminal() { + let input = "Rule -> !Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else { + panic!("expected NegativeLookahead, got {:?}", rule.expression.kind); + }; + assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "Foo")); + } + + #[test] + fn lookahead_terminal() { + let input = "Rule -> !`'` Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 2); + let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + assert!(matches!(&inner.kind, ExpressionKind::Terminal(t) if t == "'")); + assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "Foo")); + } + + #[test] + fn lookahead_charset() { + let input = "Rule -> ![`e` `E`] SUFFIX"; + let grammar = 
parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 2); + let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + let ExpressionKind::Charset(chars) = &inner.kind else { + panic!("expected Charset inside lookahead, got {:?}", inner.kind); + }; + assert_eq!(chars.len(), 2); + assert!(matches!(&chars[0], Characters::Terminal(t) if t == "e")); + assert!(matches!(&chars[1], Characters::Terminal(t) if t == "E")); + } + + #[test] + fn lookahead_grouped() { + let input = "Rule -> !(`.` | `_` | XID_Start)"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else { + panic!("expected NegativeLookahead, got {:?}", rule.expression.kind); + }; + let ExpressionKind::Grouped(grouped) = &inner.kind else { + panic!("expected Grouped inside lookahead, got {:?}", inner.kind); + }; + let ExpressionKind::Alt(alts) = &grouped.kind else { + panic!("expected Alt inside Grouped, got {:?}", grouped.kind); + }; + assert_eq!(alts.len(), 3); + assert!(matches!(&alts[0].kind, ExpressionKind::Terminal(t) if t == ".")); + assert!(matches!(&alts[1].kind, ExpressionKind::Terminal(t) if t == "_")); + assert!(matches!(&alts[2].kind, ExpressionKind::Nt(n) if n == "XID_Start")); + } + + #[test] + fn lookahead_in_sequence_middle() { + let input = "Rule -> A !B C"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 3); + assert!(matches!(&seq[0].kind, ExpressionKind::Nt(n) if n == "A")); + let 
ExpressionKind::NegativeLookahead(inner) = &seq[1].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[1].kind); + }; + assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "B")); + assert!(matches!(&seq[2].kind, ExpressionKind::Nt(n) if n == "C")); + } + + #[test] + fn lookahead_in_repetition() { + let input = "Rule -> (!A B)*"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Repeat(rep) = &rule.expression.kind else { + panic!("expected Repeat, got {:?}", rule.expression.kind); + }; + let ExpressionKind::Grouped(grouped) = &rep.kind else { + panic!("expected Grouped inside Repeat, got {:?}", rep.kind); + }; + let ExpressionKind::Sequence(seq) = &grouped.kind else { + panic!("expected Sequence inside Grouped, got {:?}", grouped.kind); + }; + assert_eq!(seq.len(), 2); + assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_))); + assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B")); + } + + #[test] + fn lookahead_in_alternation() { + let input = "Rule -> !A B | C"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Alt(alts) = &rule.expression.kind else { + panic!("expected Alt, got {:?}", rule.expression.kind); + }; + assert_eq!(alts.len(), 2); + let ExpressionKind::Sequence(seq) = &alts[0].kind else { + panic!("expected Sequence, got {:?}", alts[0].kind); + }; + assert_eq!(seq.len(), 2); + assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_))); + assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B")); + assert!(matches!(&alts[1].kind, ExpressionKind::Nt(n) if n == "C")); + } + + #[test] + fn lookahead_fail_trailing() { + let input = "Rule -> !"; + let err = parse(input).unwrap_err(); + assert!(err.contains("expected expression after !")); + } + + // --- Unicode tests --- + + #[test] + fn unicode_4_digit() { + let input = "Rule -> U+0009"; + let grammar = 
parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else { + panic!("expected Unicode, got {:?}", rule.expression.kind); + }; + assert_eq!(*ch, '\t'); + assert_eq!(s, "0009"); + } + + #[test] + fn unicode_5_digit() { + let input = "Rule -> U+E0000"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else { + panic!("expected Unicode, got {:?}", rule.expression.kind); + }; + assert_eq!(*ch, '\u{E0000}'); + assert_eq!(s, "E0000"); + } + + #[test] + fn unicode_6_digit() { + let input = "Rule -> U+10FFFF"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else { + panic!("expected Unicode, got {:?}", rule.expression.kind); + }; + assert_eq!(*ch, '\u{10FFFF}'); + assert_eq!(s, "10FFFF"); + } + + #[test] + fn unicode_in_alternation() { + let input = "Rule -> U+0009 | U+000A"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Alt(alts) = &rule.expression.kind else { + panic!("expected Alt, got {:?}", rule.expression.kind); + }; + assert_eq!(alts.len(), 2); + assert!(matches!( + &alts[0].kind, + ExpressionKind::Unicode((ch, _)) if *ch == '\t' + )); + assert!(matches!( + &alts[1].kind, + ExpressionKind::Unicode((ch, _)) if *ch == '\n' + )); + } + + // --- Character / charset range tests --- + + #[test] + fn charset_unicode_range() { + let input = "Rule -> [U+0000-U+007F]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", 
chars[0]); + }; + assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); + assert!(matches!( + b, + Character::Unicode((ch, _)) if *ch == '\u{7F}' + )); + } + + #[test] + fn charset_char_range() { + let input = "Rule -> [`a`-`z`]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a, Character::Char(ch) if *ch == 'a')); + assert!(matches!(b, Character::Char(ch) if *ch == 'z')); + } + + #[test] + fn charset_mixed_range() { + let input = "Rule -> [`a`-U+007A]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a, Character::Char(ch) if *ch == 'a')); + assert!(matches!( + b, + Character::Unicode((ch, _)) if *ch == 'z' + )); + } + + #[test] + fn charset_multiple_unicode_ranges() { + let input = "Rule -> [U+0000-U+D7FF U+E000-U+10FFFF]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 2); + let Characters::Range(a1, b1) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a1, Character::Unicode((ch, _)) if *ch == '\0')); + assert!(matches!(b1, Character::Unicode((ch, _)) if *ch == '\u{D7FF}')); + let Characters::Range(a2, b2) = &chars[1] else { + panic!("expected Range, 
got {:?}", chars[1]); + }; + assert!(matches!(a2, Character::Unicode((ch, _)) if *ch == '\u{E000}')); + assert!(matches!(b2, Character::Unicode((ch, _)) if *ch == '\u{10FFFF}')); + } + + #[test] + fn charset_terminals_and_named() { + let input = "Rule -> [`a` `b` Foo]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 3); + assert!(matches!(&chars[0], Characters::Terminal(t) if t == "a")); + assert!(matches!(&chars[1], Characters::Terminal(t) if t == "b")); + assert!(matches!(&chars[2], Characters::Named(n) if n == "Foo")); + } + + // --- Negative lookahead combined with charset --- + + #[test] + fn lookahead_charset_with_named_and_terminals() { + // Pattern from tokens.md: ![`'` `\` LF CR TAB] ASCII + let input = "Rule -> ![`x` `y` LF] Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 2); + let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + let ExpressionKind::Charset(chars) = &inner.kind else { + panic!("expected Charset, got {:?}", inner.kind); + }; + assert_eq!(chars.len(), 3); + assert!(matches!(&chars[0], Characters::Terminal(t) if t == "x")); + assert!(matches!(&chars[1], Characters::Terminal(t) if t == "y")); + assert!(matches!(&chars[2], Characters::Named(n) if n == "LF")); + } + + // --- Negative lookahead combined with Unicode --- + + #[test] + fn lookahead_charset_with_unicode_range() { + let input = "Rule -> ![U+0000-U+007F] Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = 
&rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + let ExpressionKind::Charset(chars) = &inner.kind else { + panic!("expected Charset, got {:?}", inner.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); + assert!(matches!( + b, + Character::Unicode((ch, _)) if *ch == '\u{7F}' + )); + } } diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index 7f9624fc86..316eb9aaf3 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -250,3 +250,181 @@ fn markdown_escape(s: &str) -> Cow<'_, str> { LazyLock::new(|| Regex::new(r#"[\\`_*\[\](){}'".-]"#).unwrap()); ESC_RE.replace_all(s, r"\$0") } + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + /// Creates a minimal `RenderCtx` for testing. + fn test_cx() -> RenderCtx { + RenderCtx { + md_link_map: HashMap::new(), + rr_link_map: HashMap::new(), + for_summary: false, + } + } + + /// Renders a single expression to a markdown string. 
+ fn render(kind: ExpressionKind) -> String { + let cx = test_cx(); + let expr = Expression::new_kind(kind); + let mut output = String::new(); + render_expression(&expr, &cx, &mut output); + output + } + + // -- Negative lookahead tests -- + + #[test] + fn lookahead_nonterminal() { + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Nt("CHAR".to_string())), + ))); + assert!(result.contains("!"), "should contain `!` prefix"); + assert!( + result.contains("CHAR"), + "should contain the nonterminal name" + ); + } + + #[test] + fn lookahead_terminal() { + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Terminal("'".to_string())), + ))); + assert!(result.starts_with("!"), "should start with `!`"); + assert!( + result.contains("grammar-literal"), + "should render inner terminal as a grammar literal" + ); + } + + #[test] + fn lookahead_charset() { + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Charset(vec![ + Characters::Terminal("e".to_string()), + Characters::Terminal("E".to_string()), + ])), + ))); + assert!(result.starts_with("!"), "should start with `!`"); + assert!( + result.contains("\\["), + "should contain escaped opening bracket for charset" + ); + } + + #[test] + fn lookahead_grouped() { + // !( `.` | `_` ) + let inner = + ExpressionKind::Grouped(Box::new(Expression::new_kind(ExpressionKind::Alt(vec![ + Expression::new_kind(ExpressionKind::Terminal(".".to_string())), + Expression::new_kind(ExpressionKind::Terminal("_".to_string())), + ])))); + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(inner), + ))); + assert!(result.starts_with("!(")); + assert!(result.contains("|")); + } + + // -- Unicode tests -- + + #[test] + fn unicode_4_digit() { + let result = render(ExpressionKind::Unicode(('\t', "0009".to_string()))); + assert_eq!(result, "U+0009"); + } + + 
#[test] + fn unicode_6_digit() { + let result = render(ExpressionKind::Unicode(( + '\u{10FFFF}', + "10FFFF".to_string(), + ))); + assert_eq!(result, "U+10FFFF"); + } + + // -- Charset with Unicode range tests -- + + #[test] + fn charset_unicode_range() { + let result = render(ExpressionKind::Charset(vec![Characters::Range( + Character::Unicode(('\0', "0000".to_string())), + Character::Unicode(('\u{007F}', "007F".to_string())), + )])); + assert!(result.contains("\\[")); + assert!(result.contains("U+0000")); + assert!(result.contains("U+007F")); + assert!(result.contains("-")); + } + + #[test] + fn charset_char_range() { + let result = render(ExpressionKind::Charset(vec![Characters::Range( + Character::Char('a'), + Character::Char('z'), + )])); + assert!(result.contains("\\[")); + assert!(result.contains("grammar-literal")); + assert!(result.contains("-")); + } + + #[test] + fn charset_mixed_range() { + // [`a`-U+007A] + let result = render(ExpressionKind::Charset(vec![Characters::Range( + Character::Char('a'), + Character::Unicode(('\u{007A}', "007A".to_string())), + )])); + assert!(result.contains("grammar-literal")); + assert!(result.contains("U+007A")); + assert!(result.contains("-")); + } + + // -- Cut test -- + + #[test] + fn cut_rendering() { + let result = render(ExpressionKind::Cut(Box::new(Expression::new_kind( + ExpressionKind::Nt("Foo".to_string()), + )))); + assert!(result.starts_with("^ "), "cut should render as `^ ` prefix"); + assert!(result.contains("Foo")); + } + + // -- NegExpression test -- + + #[test] + fn neg_expression_rendering() { + let result = render(ExpressionKind::NegExpression(Box::new( + Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( + "a".to_string(), + )])), + ))); + assert!( + result.starts_with("~"), + "neg expression should render as `~` prefix" + ); + } + + // -- Markdown escape tests -- + + #[test] + fn markdown_escape_backtick() { + assert_eq!(markdown_escape("`"), "\\`"); + } + + #[test] + fn 
markdown_escape_brackets() { + assert_eq!(markdown_escape("["), "\\["); + assert_eq!(markdown_escape("]"), "\\]"); + } + + #[test] + fn markdown_escape_plain() { + assert_eq!(markdown_escape("abc"), "abc"); + } +} diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index 1543127889..ad7b291e57 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -391,7 +391,7 @@ impl Node for Except { #[cfg(test)] mod tests { use super::*; - use grammar::{Expression, ExpressionKind, RangeLimit}; + use grammar::{Character, Characters, Expression, ExpressionKind, RangeLimit}; /// Render an expression to an SVG string fragment. fn render_to_svg(expr: &Expression) -> Option<String> { @@ -411,6 +411,8 @@ mod tests { }) } + // -- RepeatRange tests -- + #[test] fn test_empty_exclusive_equal() { // `e{2..2}` (half-open, min == max) renders as empty. @@ -476,4 +478,117 @@ mod tests { "expected nonterminal for e{{..=1}}, got: {svg}" ); } + + // -- Negative lookahead tests -- + + #[test] + fn lookahead_nonterminal() { + let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Nt("CHAR".to_string())), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!( + svg.contains("not followed by"), + "should contain the 'not followed by' label" + ); + assert!(svg.contains("CHAR"), "should contain the nonterminal name"); + } + + #[test] + fn lookahead_terminal() { + let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Terminal("CR".to_string())), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("not followed by")); + assert!(svg.contains("CR")); + } + + #[test] + fn lookahead_charset() { + let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Charset(vec![
Characters::Terminal("e".to_string()), + Characters::Terminal("E".to_string()), + ])), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("not followed by")); + assert!(svg.contains("e")); + assert!(svg.contains("E")); + } + + // -- Unicode tests -- + + #[test] + fn unicode_4_digit() { + let expr = Expression::new_kind(ExpressionKind::Unicode(('\t', "0009".to_string()))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("U+0009"), "should render Unicode code point"); + } + + #[test] + fn unicode_6_digit() { + let expr = Expression::new_kind(ExpressionKind::Unicode(( + '\u{10FFFF}', + "10FFFF".to_string(), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("U+10FFFF")); + } + + // -- Charset with ranges -- + + #[test] + fn charset_unicode_range() { + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( + Character::Unicode(('\0', "0000".to_string())), + Character::Unicode(('\u{007F}', "007F".to_string())), + )])); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("U+0000")); + assert!(svg.contains("U+007F")); + } + + #[test] + fn charset_char_range() { + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( + Character::Char('a'), + Character::Char('z'), + )])); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("a")); + assert!(svg.contains("z")); + } + + // -- Cut test -- + + #[test] + fn cut_rendering() { + let expr = Expression::new_kind(ExpressionKind::Cut(Box::new(Expression::new_kind( + ExpressionKind::Nt("Foo".to_string()), + )))); + let svg = render_to_svg(&expr).unwrap(); + assert!( + svg.contains("no backtracking"), + "cut should render with 'no backtracking' label" + ); + assert!(svg.contains("Foo")); + } + + // -- NegExpression test -- + + #[test] + fn neg_expression_rendering() { + let expr = Expression::new_kind(ExpressionKind::NegExpression(Box::new( + 
Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( + "a".to_string(), + )])), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!( + svg.contains("with the exception of"), + "neg expression should have exception label" + ); + } }