From fef395cfdad8fb083f329db23bb6712e291d798f Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Tue, 17 Feb 2026 17:51:57 -0800 Subject: [PATCH 1/3] Add support for named repeat ranges This adds the ability to add a label to a repeat range so that a subsequent expression can match the same repetition. This is intended to help with expressing things like raw strings where the `#` characters must be balanced on both sides, with a limit on the number of matches. --- dev-guide/src/grammar.md | 9 +++++++-- src/notation.md | 2 ++ tools/grammar/src/lib.rs | 6 +++++- tools/grammar/src/parser.rs | 20 ++++++++++++++++++- .../src/grammar/render_markdown.rs | 9 ++++++++- .../src/grammar/render_railroad.rs | 17 ++++++++++++++++ 6 files changed, 58 insertions(+), 5 deletions(-) diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 7a4cdea466..8f536a2ace 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -57,6 +57,7 @@ Quantifier -> | RepeatPlusNonGreedy | RepeatRange | RepeatRangeInclusive + | RepeatRangeNamed Optional -> `?` @@ -68,9 +69,11 @@ RepeatPlus -> `+` RepeatPlusNonGreedy -> `+?` -RepeatRange -> `{` Range? `..` Range? `}` +RepeatRange -> `{` ( Name `:` )? Range? `..` Range? `}` -RepeatRangeInclusive -> `{` Range? `..=` Range `}` +RepeatRangeInclusive -> `{` ( Name `:` )? Range? `..=` Range `}` + +RepeatRangeNamed -> `{` Name `}` Range -> [0-9]+ @@ -150,6 +153,8 @@ The general format is a series of productions separated by blank lines. The expr | RepeatPlusNonGreedy | Expr+? | The preceding expression is repeated 1 or more times without being greedy. | | RepeatRange | Expr{2..4} | The preceding expression is repeated between the range of times specified. Either bound can be excluded, which works just like Rust ranges. | | RepeatRangeInclusive | Expr{2..=4} | The preceding expression is repeated between the inclusive range of times specified. The lower bound can be omitted. | +| Named RepeatRangeInclusive | Expr{name:2..=4} | If a name precedes the range, then the number of repetitions are stored in a variable with that name that subsequent RepeatRangeNamed expressions can refer to. | +| RepeatRangeNamed | Expr{name} | Repeat the number of times from the previously labeled repetition. | ## Automatic linking diff --git a/src/notation.md b/src/notation.md index b74c74b22f..ce3eee2ef8 100644 --- a/src/notation.md +++ b/src/notation.md @@ -18,6 +18,8 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets: | x+ | _MacroMatch_+ | 1 or more of x | | xa..b | HEX_DIGIT1..6 | a to b repetitions of x, exclusive of b | | xa..=b | HEX_DIGIT1..=5 | a to b repetitions of x, inclusive of b | +| xn:a..=b | `#`n:1..=255 | a labeled repetition that a subsequent repetition can refer to | +| xn | `#`n | repeat the number of times from the previously labeled repetition | | Rule1 Rule2 | `fn` _Name_ _Parameters_ | Sequence of rules in order | | \| | `u8` \| `u16`, Block \| Item | Either one or another | | ! | !COMMENT | Matches if the expression does not follow, without consuming any input | diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 1d64e45143..5a7fb3c460 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -61,13 +61,16 @@ pub enum ExpressionKind { RepeatPlus(Box), /// `A+?` RepeatPlusNonGreedy(Box), - /// `A{2..4}` or `A{2..=4}` + /// `A{2..4}` or `A{2..=4}` or `A{name:2..=4}` RepeatRange { expr: Box, + name: Option, min: Option, max: Option, limit: RangeLimit, }, + /// `A{name}` + RepeatRangeNamed(Box, String), /// `NonTerminal` Nt(String), /// `` `string` `` @@ -172,6 +175,7 @@ impl Expression { | ExpressionKind::RepeatPlus(e) | ExpressionKind::RepeatPlusNonGreedy(e) | ExpressionKind::RepeatRange { expr: e, .. } + | ExpressionKind::RepeatRangeNamed(e, _) | ExpressionKind::NegExpression(e) | ExpressionKind::Cut(e) => { e.visit_nt(callback); diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index 0db6b478b5..e6f465efd4 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -459,9 +459,26 @@ impl Parser<'_> { }) } - /// Parse `{a..b}` | `{a..=b}` after expression. + /// Parse `{a..b}` | `{a..=b}` | `{name:a..=b}` | `{name}` after expression. + // + // `name:` before the range is a named binding. `{name}` refers to that binding. fn parse_repeat_range(&mut self, kind: ExpressionKind) -> Result { self.expect("{", "expected `{`")?; + let start = self.index; + let name = match (self.parse_name(), self.peek()) { + (Some(name), Some(b':')) => { + self.index += 1; + Some(name) + } + (Some(name), Some(b'}')) => { + self.index += 1; + return Ok(ExpressionKind::RepeatRangeNamed(box_kind(kind), name)); + } + _ => { + self.index = start; + None + } + }; let min = self.take_while(&|x| x.is_ascii_digit()); let Ok(min) = (!min.is_empty()).then(|| min.parse::()).transpose() else { bail!(self, "malformed range start"); @@ -492,6 +509,7 @@ impl Parser<'_> { self.expect("}", "expected `}`")?; Ok(ExpressionKind::RepeatRange { expr: box_kind(kind), + name, min, max, limit, diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index 316eb9aaf3..1cc76f781b 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -73,6 +73,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { | ExpressionKind::RepeatPlus(_) | ExpressionKind::RepeatPlusNonGreedy(_) | ExpressionKind::RepeatRange { .. } + | ExpressionKind::RepeatRangeNamed(_, _) | ExpressionKind::Nt(_) | ExpressionKind::Terminal(_) | ExpressionKind::Prose(_) @@ -142,6 +143,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { } ExpressionKind::RepeatRange { expr, + name, min, max, limit, @@ -149,12 +151,17 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { render_expression(expr, cx, output); write!( output, - "{min}{limit}{max}", + "{name}{min}{limit}{max}", + name = name.as_ref().map(|n| format!("{n}:")).unwrap_or_default(), min = min.map(|v| v.to_string()).unwrap_or_default(), max = max.map(|v| v.to_string()).unwrap_or_default(), ) .unwrap(); } + ExpressionKind::RepeatRangeNamed(e, name) => { + render_expression(e, cx, output); + write!(output, "{name}").unwrap(); + } ExpressionKind::Nt(nt) => { let dest = cx.md_link_map.get(nt).map_or("missing", |d| d.as_str()); write!(output, "[{nt}]({dest})").unwrap(); diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index ad7b291e57..94f8060962 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -81,6 +81,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option Option Option Option Option Option Option Option Option Option unreachable!("closed range must have upper bound"), + ExpressionKind::RepeatRangeNamed(e, name) => { + let n = render_expression(e, cx, stack)?; + let cmt = format!("repeat exactly {name} times"); + let lbox = LabeledBox::new(n, Comment::new(cmt)); + Box::new(lbox) + } ExpressionKind::Nt(nt) => node_for_nt(cx, nt), ExpressionKind::Terminal(t) => Box::new(Terminal::new(t.clone())), ExpressionKind::Prose(s) => Box::new(Terminal::new(s.clone())), @@ -405,6 +421,7 @@ mod tests { fn range_expr(min: Option, max: Option, limit: RangeLimit) -> Expression { Expression::new_kind(ExpressionKind::RepeatRange { expr: Box::new(Expression::new_kind(ExpressionKind::Nt("e".to_string()))), + name: None, min, max, limit, From 0007c5000351e03d13a0f95d68c0b38662ca3589 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Tue, 17 Feb 2026 18:01:46 -0800 Subject: [PATCH 2/3] Change raw strings to use named repetitions This changes the raw string grammars to use named repetition to represent that the `#` characters need to be balanced within a specific limit. This also adds a cut after the `#` and before the `"` because rustc generates an error in this situation if a `"` is not found. It's maybe not the prettiest, and I'm on the fence whether this makes it clearer. --- src/tokens.md | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/tokens.md b/src/tokens.md index d878eabfe2..0f0964bfce 100644 --- a/src/tokens.md +++ b/src/tokens.md @@ -214,11 +214,13 @@ r[lex.token.literal.str-raw] r[lex.token.literal.str-raw.syntax] ```grammar,lexer -RAW_STRING_LITERAL -> `r` RAW_STRING_CONTENT SUFFIX? +RAW_STRING_LITERAL -> + `r` `"` ^ RAW_STRING_CONTENT `"` SUFFIX? + | `r` `#`{n:1..=255} ^ `"` RAW_STRING_CONTENT_HASHED `"` `#`{n} SUFFIX? -RAW_STRING_CONTENT -> - `"` ^ ( ~CR )*? `"` - | `#` RAW_STRING_CONTENT `#` +RAW_STRING_CONTENT -> (!`"` ~CR )* + +RAW_STRING_CONTENT_HASHED -> (!(`"` `#`{n}) ~CR )* ``` r[lex.token.literal.str-raw.intro] @@ -301,11 +303,12 @@ r[lex.token.str-byte-raw] r[lex.token.str-byte-raw.syntax] ```grammar,lexer RAW_BYTE_STRING_LITERAL -> - `br` RAW_BYTE_STRING_CONTENT SUFFIX? + `br` `"` ^ RAW_BYTE_STRING_CONTENT `"` SUFFIX? + | `br` `#`{n:1..=255} ^ `"` RAW_BYTE_STRING_CONTENT_HASHED `"` `#`{n} SUFFIX? + +RAW_BYTE_STRING_CONTENT -> (!`"` ASCII_FOR_RAW )* -RAW_BYTE_STRING_CONTENT -> - `"` ^ ASCII_FOR_RAW*? `"` - | `#` RAW_BYTE_STRING_CONTENT `#` +RAW_BYTE_STRING_CONTENT_HASHED -> (!(`"` `#`{n}) ASCII_FOR_RAW )* ASCII_FOR_RAW -> !CR ASCII ``` @@ -395,11 +398,12 @@ r[lex.token.str-c-raw] r[lex.token.str-c-raw.syntax] ```grammar,lexer RAW_C_STRING_LITERAL -> - `cr` RAW_C_STRING_CONTENT SUFFIX? + `cr` `"` ^ RAW_C_STRING_CONTENT `"` SUFFIX? + | `cr` `#`{n:1..=255} ^ `"` RAW_C_STRING_CONTENT_HASHED `"` `#`{n} SUFFIX? + +RAW_C_STRING_CONTENT -> (!`"` ~[CR NUL] )* -RAW_C_STRING_CONTENT -> - `"` ^ ( ~[CR NUL] )*? `"` - | `#` RAW_C_STRING_CONTENT `#` +RAW_C_STRING_CONTENT_HASHED -> (!(`"` `#`{n}) ~[CR NUL] )* ``` r[lex.token.str-c-raw.intro] From 38cdee0e0beb3f8edf888163adbf57b79c4e1c08 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Tue, 17 Feb 2026 18:05:38 -0800 Subject: [PATCH 3/3] Remove non-greedy grammar support These non-greedy repetitions are no longer needed because we now use negative lookahead in the rules that were using them. This is intended to simplify things a little, and lean in on the negative lookahead. There were two alternate interpretations of `R1 -> A E*? S B`: R1 -> A _0 B _0 -> S | (E _0) or R1 -> A _0 _0 -> (S B) | (E _0) Rather than trying to document this subtle interpretation, this chooses to just get rid of it and be explicit about what is not allowed to follow. --- dev-guide/src/grammar.md | 8 -------- tools/grammar/src/lib.rs | 6 ------ tools/grammar/src/parser.rs | 16 ++++------------ tools/mdbook-spec/src/grammar/render_markdown.rs | 10 ---------- tools/mdbook-spec/src/grammar/render_railroad.rs | 12 ------------ 5 files changed, 4 insertions(+), 48 deletions(-) diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 8f536a2ace..341c09522c 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -52,9 +52,7 @@ Footnote -> `[^` ~[`]` LF]+ `]` Quantifier -> Optional | Repeat - | RepeatNonGreedy | RepeatPlus - | RepeatPlusNonGreedy | RepeatRange | RepeatRangeInclusive | RepeatRangeNamed @@ -63,12 +61,8 @@ Optional -> `?` Repeat -> `*` -RepeatNonGreedy -> `*?` - RepeatPlus -> `+` -RepeatPlusNonGreedy -> `+?` - RepeatRange -> `{` ( Name `:` )? Range? `..` Range? `}` RepeatRangeInclusive -> `{` ( Name `:` )? Range? `..=` Range `}` @@ -148,9 +142,7 @@ The general format is a series of productions separated by blank lines. The expr | Optional | Expr? | The preceding expression is optional. | | NegativeLookahead | !Expr | Matches if Expr does not follow, without consuming any input. | | Repeat | Expr* | The preceding expression is repeated 0 or more times. | -| RepeatNonGreedy | Expr*? | The preceding expression is repeated 0 or more times without being greedy. | | RepeatPlus | Expr+ | The preceding expression is repeated 1 or more times. | -| RepeatPlusNonGreedy | Expr+? | The preceding expression is repeated 1 or more times without being greedy. | | RepeatRange | Expr{2..4} | The preceding expression is repeated between the range of times specified. Either bound can be excluded, which works just like Rust ranges. | | RepeatRangeInclusive | Expr{2..=4} | The preceding expression is repeated between the inclusive range of times specified. The lower bound can be omitted. | | Named RepeatRangeInclusive | Expr{name:2..=4} | If a name precedes the range, then the number of repetitions are stored in a variable with that name that subsequent RepeatRangeNamed expressions can refer to. | diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 5a7fb3c460..c16a07211c 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -55,12 +55,8 @@ pub enum ExpressionKind { NegativeLookahead(Box), /// `A*` Repeat(Box), - /// `A*?` - RepeatNonGreedy(Box), /// `A+` RepeatPlus(Box), - /// `A+?` - RepeatPlusNonGreedy(Box), /// `A{2..4}` or `A{2..=4}` or `A{name:2..=4}` RepeatRange { expr: Box, @@ -171,9 +167,7 @@ impl Expression { | ExpressionKind::Optional(e) | ExpressionKind::NegativeLookahead(e) | ExpressionKind::Repeat(e) - | ExpressionKind::RepeatNonGreedy(e) | ExpressionKind::RepeatPlus(e) - | ExpressionKind::RepeatPlusNonGreedy(e) | ExpressionKind::RepeatRange { expr: e, .. } | ExpressionKind::RepeatRangeNamed(e, _) | ExpressionKind::NegExpression(e) diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index e6f465efd4..a48674c201 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -439,24 +439,16 @@ impl Parser<'_> { Ok(ExpressionKind::Optional(box_kind(kind))) } - /// Parse `*` | `*?` after expression. + /// Parse `*` after expression. fn parse_repeat(&mut self, kind: ExpressionKind) -> Result { self.expect("*", "expected `*`")?; - Ok(if self.take_str("?") { - ExpressionKind::RepeatNonGreedy(box_kind(kind)) - } else { - ExpressionKind::Repeat(box_kind(kind)) - }) + Ok(ExpressionKind::Repeat(box_kind(kind))) } - /// Parse `+` | `+?` after expression. + /// Parse `+` after expression. fn parse_repeat_plus(&mut self, kind: ExpressionKind) -> Result { self.expect("+", "expected `+`")?; - Ok(if self.take_str("?") { - ExpressionKind::RepeatPlusNonGreedy(box_kind(kind)) - } else { - ExpressionKind::RepeatPlus(box_kind(kind)) - }) + Ok(ExpressionKind::RepeatPlus(box_kind(kind))) } /// Parse `{a..b}` | `{a..=b}` | `{name:a..=b}` | `{name}` after expression. diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index 1cc76f781b..d79c949325 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -69,9 +69,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { | ExpressionKind::Optional(_) | ExpressionKind::NegativeLookahead(_) | ExpressionKind::Repeat(_) - | ExpressionKind::RepeatNonGreedy(_) | ExpressionKind::RepeatPlus(_) - | ExpressionKind::RepeatPlusNonGreedy(_) | ExpressionKind::RepeatRange { .. } | ExpressionKind::RepeatRangeNamed(_, _) | ExpressionKind::Nt(_) @@ -129,18 +127,10 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { render_expression(e, cx, output); output.push_str("\\*"); } - ExpressionKind::RepeatNonGreedy(e) => { - render_expression(e, cx, output); - output.push_str("\\* (non-greedy)"); - } ExpressionKind::RepeatPlus(e) => { render_expression(e, cx, output); output.push_str("+"); } - ExpressionKind::RepeatPlusNonGreedy(e) => { - render_expression(e, cx, output); - output.push_str("+ (non-greedy)"); - } ExpressionKind::RepeatRange { expr, name, diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index 94f8060962..aed4e8f151 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -174,12 +174,6 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option { - let n = render_expression(e, cx, stack)?; - let r = Box::new(Optional::new(Repeat::new(n, railroad::Empty))); - let lbox = LabeledBox::new(r, Comment::new("non-greedy".to_string())); - Box::new(lbox) - } // Treat `e+` and `e{1..}` equally. ExpressionKind::RepeatPlus(e) | ExpressionKind::RepeatRange { @@ -192,12 +186,6 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option { - let n = render_expression(e, cx, stack)?; - let r = Repeat::new(n, railroad::Empty); - let lbox = LabeledBox::new(r, Comment::new("non-greedy".to_string())); - Box::new(lbox) - } // For `e{..=0}` / `e{0..=0}` or `e{..1}` / `e{0..1}` render an empty node. ExpressionKind::RepeatRange { max: Some(0), .. } | ExpressionKind::RepeatRange {