diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 40e4883096..7a4cdea466 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -39,7 +39,11 @@ Sequence -> (` `* AdornedExpr)* ` `* Cut | (` `* AdornedExpr)+ -AdornedExpr -> Expr1 Quantifier? Suffix? Footnote? +AdornedExpr -> Prefix? Expr1 Quantifier? Suffix? Footnote? + +Prefix -> NegativeLookahead + +NegativeLookahead -> `!` Suffix -> ` _` <not underscore, unless in backtick>* `_` @@ -81,7 +85,7 @@ Expr1 -> | Group | NegativeExpression -Unicode -> `U+` [`A`-`Z` `0`-`9`]4..=4 +Unicode -> `U+` [`A`-`Z` `0`-`9`]4..=6 NonTerminal -> Name @@ -98,7 +102,11 @@ Characters -> | CharacterTerminal | CharacterName -CharacterRange -> BACKTICK <any char> BACKTICK `-` BACKTICK <any char> BACKTICK +CharacterRange -> Character `-` Character + +Character -> + BACKTICK <any char> BACKTICK + | Unicode CharacterTerminal -> Terminal @@ -123,7 +131,7 @@ The general format is a series of productions separated by blank lines. The expr | Comment | // Single line comment. | A comment extending to the end of the line. | | Terminal | \`example\` | A sequence of exact characters, surrounded by backticks. | | Charset | \[ \`A\`-\`Z\` \`0\`-\`9\` \`_\` \] | A choice from a set of characters, space-separated. There are three different forms. | -| CharacterRange | \[ \`A\`-\`Z\` \] | A range of characters; each character should be in backticks. | +| CharacterRange | \[ \`A\`-\`Z\` \] | A range of characters. Characters can be a Unicode expression or be a literal character surrounded by backticks. | | CharacterTerminal | \[ \`x\` \] | A single character, surrounded by backticks. | | CharacterName | \[ LF \] | A nonterminal, referring to another production. | | Prose | \<any ASCII character except CR\> | An English description of what should be matched, surrounded in angle brackets. | @@ -135,6 +143,7 @@ The general format is a series of productions separated by blank lines. 
The expr | Suffix | \_except \[LazyBooleanExpression\]\_ | Adds a suffix to the previous expression to provide an additional English description, rendered in subscript. This can contain limited Markdown, but try to avoid anything except basics like links. | | Footnote | \[^extern-safe\] | Adds a footnote, which can supply extra information that may be helpful to the user. The footnote itself should be defined outside of the code block like a normal Markdown footnote. | | Optional | Expr? | The preceding expression is optional. | +| NegativeLookahead | !Expr | Matches if Expr does not follow, without consuming any input. | | Repeat | Expr* | The preceding expression is repeated 0 or more times. | | RepeatNonGreedy | Expr*? | The preceding expression is repeated 0 or more times without being greedy. | | RepeatPlus | Expr+ | The preceding expression is repeated 1 or more times. | diff --git a/src/comments.md b/src/comments.md index a240e7dc58..ef283a9ea1 100644 --- a/src/comments.md +++ b/src/comments.md @@ -3,34 +3,49 @@ r[comments] r[comments.syntax] ```grammar,lexer -@root LINE_COMMENT -> +@root COMMENT -> + LINE_COMMENT + | INNER_LINE_DOC + | OUTER_LINE_DOC + | INNER_BLOCK_DOC + | OUTER_BLOCK_DOC + | BLOCK_COMMENT + +LINE_COMMENT -> `//` (~[`/` `!` LF] | `//`) ~LF* - | `//` + | `//` EOF + | `//` _immediately followed by LF_ BLOCK_COMMENT -> - `/*` + `/**/` + | `/***/` + | `/*` + ^ ( ~[`*` `!`] | `**` | BLOCK_COMMENT_OR_DOC ) ( BLOCK_COMMENT_OR_DOC | ~`*/` )* `*/` - | `/**/` - | `/***/` -@root INNER_LINE_DOC -> - `//!` ~[LF CR]* +INNER_LINE_DOC -> + `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) + +LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* INNER_BLOCK_DOC -> - `/*!` ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* `*/` + `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` -@root OUTER_LINE_DOC -> - `///` (~`/` ~[LF CR]*)? 
+OUTER_LINE_DOC -> + `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) OUTER_BLOCK_DOC -> - `/**` + `/**` ![`*` `/`] + ^ ( ~`*` | BLOCK_COMMENT_OR_DOC ) - ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* + ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` -@root BLOCK_COMMENT_OR_DOC -> +BLOCK_CHAR -> (!(`*/` | CR) CHAR) + +BLOCK_COMMENT_OR_DOC -> BLOCK_COMMENT | OUTER_BLOCK_DOC | INNER_BLOCK_DOC @@ -51,7 +66,7 @@ r[comments.doc.syntax] Line doc comments beginning with exactly _three_ slashes (`///`), and block doc comments (`/** ... */`), both outer doc comments, are interpreted as a special syntax for [`doc` attributes]. r[comments.doc.attributes] -That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc="Foo"]` and `/** Bar */` turns into `#[doc="Bar"]`. They must therefore appear before something that accepts an outer attribute. +That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc=" Foo"]` and `/** Bar */` turns into `#[doc=" Bar "]`. They must therefore appear before something that accepts an outer attribute. r[comments.doc.inner-syntax] Line comments beginning with `//!` and block comments `/*! ... */` are doc comments that apply to the parent of the comment, rather than the item that follows. diff --git a/src/identifiers.md b/src/identifiers.md index 979284a1c7..5abe303ca2 100644 --- a/src/identifiers.md +++ b/src/identifiers.md @@ -16,7 +16,7 @@ NON_KEYWORD_IDENTIFIER -> IDENTIFIER_OR_KEYWORD _except a [strict][lex.keywords. 
IDENTIFIER -> NON_KEYWORD_IDENTIFIER | RAW_IDENTIFIER RESERVED_RAW_IDENTIFIER -> - `r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by XID_Continue_ + `r#` (`_` | `crate` | `self` | `Self` | `super`) !XID_Continue ``` diff --git a/src/input-format.md b/src/input-format.md index 2432da0339..3e35cba1ee 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -3,9 +3,13 @@ r[input] r[input.syntax] ```grammar,lexer -@root CHAR -> <a Unicode scalar value> +CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value + +ASCII -> [U+0000-U+007F] NUL -> U+0000 + +EOF -> !CHAR // End of file or input ``` r[input.intro] diff --git a/src/notation.md b/src/notation.md index 850ee9fb5e..b74c74b22f 100644 --- a/src/notation.md +++ b/src/notation.md @@ -20,13 +20,14 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets: | x<sup>a..=b</sup> | HEX_DIGIT<sup>1..=5</sup> | a to b repetitions of x, inclusive of b | | Rule1 Rule2 | `fn` _Name_ _Parameters_ | Sequence of rules in order | | \| | `u8` \| `u16`, Block \| Item | Either one or another | +| ! | !COMMENT | Matches if the expression does not follow, without consuming any input | | \[ ] | \[`b` `B`] | Any of the characters listed | | \[ - ] | \[`a`-`z`] | Any of the characters in the range | | ~\[ ] | ~\[`b` `B`] | Any characters, except those listed | | ~`string` | ~`\n`, ~`*/` | Any characters, except this sequence | | ( ) | (`,` _Parameter_)? | Groups items | | ^ | `b'` ^ ASCII_FOR_CHAR | The rest of the sequence must match or parsing fails unconditionally ([hard cut operator]) | -| U+xxxx | U+0060 | A single unicode character | +| U+xxxx..xxxxxx | U+0060 | A single Unicode character | | \<text\> | \<any ASCII char except CR\> | An English description of what should be matched | | Rule suffix | IDENTIFIER_OR_KEYWORD _except `crate`_ | A modification to the previous rule | | // Comment. | // Single line comment. | A comment extending to the end of the line. 
| diff --git a/src/tokens.md b/src/tokens.md index 047afd76a6..d878eabfe2 100644 --- a/src/tokens.md +++ b/src/tokens.md @@ -115,7 +115,7 @@ r[lex.token.literal.suffix.syntax] ```grammar,lexer SUFFIX -> IDENTIFIER_OR_KEYWORD _except `_`_ -SUFFIX_NO_E -> SUFFIX _not beginning with `e` or `E`_ +SUFFIX_NO_E -> ![`e` `E`] SUFFIX ``` r[lex.token.literal.suffix.validity] @@ -253,8 +253,7 @@ r[lex.token.byte.syntax] BYTE_LITERAL -> `b'` ^ ( ASCII_FOR_CHAR | BYTE_ESCAPE ) `'` SUFFIX? -ASCII_FOR_CHAR -> - +ASCII_FOR_CHAR -> ![`'` `\` LF CR TAB] ASCII BYTE_ESCAPE -> `\x` HEX_DIGIT HEX_DIGIT @@ -272,8 +271,7 @@ r[lex.token.str-byte.syntax] BYTE_STRING_LITERAL -> `b"` ^ ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX? -ASCII_FOR_STRING -> - +ASCII_FOR_STRING -> ![`"` `\` CR] ASCII ``` r[lex.token.str-byte.intro] @@ -309,8 +307,7 @@ RAW_BYTE_STRING_CONTENT -> `"` ^ ASCII_FOR_RAW*? `"` | `#` RAW_BYTE_STRING_CONTENT `#` -ASCII_FOR_RAW -> - +ASCII_FOR_RAW -> !CR ASCII ``` r[lex.token.str-byte-raw.intro] @@ -559,10 +556,10 @@ r[lex.token.literal.float.syntax] FLOAT_LITERAL -> DEC_LITERAL (`.` DEC_LITERAL)? FLOAT_EXPONENT SUFFIX? | DEC_LITERAL `.` DEC_LITERAL SUFFIX_NO_E? - | DEC_LITERAL `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | DEC_LITERAL `.` !(`.` | `_` | XID_Start) FLOAT_EXPONENT -> - (`e`|`E`) (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)* + (`e`|`E`) ^ (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)* ``` r[lex.token.literal.float.form] @@ -608,13 +605,11 @@ r[lex.token.literal.reserved.syntax] RESERVED_NUMBER -> BIN_LITERAL [`2`-`9`] | OCT_LITERAL [`8`-`9`] - | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` !(`.` | `_` | XID_Start) | ( BIN_LITERAL | OCT_LITERAL ) (`e`|`E`) - | `0b` `_`* - | `0o` `_`* - | `0x` `_`* - | DEC_LITERAL ( `.` DEC_LITERAL )? (`e` | `E`) (`+` | `-`)? 
- + | `0b` `_`* !BIN_DIGIT + | `0o` `_`* !OCT_DIGIT + | `0x` `_`* !HEX_DIGIT ``` r[lex.token.literal.reserved.intro] @@ -657,16 +652,16 @@ r[lex.token.life.syntax] ```grammar,lexer LIFETIME_TOKEN -> RAW_LIFETIME - | `'` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + | `'` IDENTIFIER_OR_KEYWORD !`'` LIFETIME_OR_LABEL -> RAW_LIFETIME - | `'` NON_KEYWORD_IDENTIFIER _not immediately followed by `'`_ + | `'` NON_KEYWORD_IDENTIFIER !`'` RAW_LIFETIME -> - `'r#` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + `'r#` ^ IDENTIFIER_OR_KEYWORD !`'` -RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by `'`_ +RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) !(`'` | XID_Continue) ``` r[lex.token.life.intro] diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 6fbb886558..1d64e45143 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -51,6 +51,8 @@ pub enum ExpressionKind { Sequence(Vec), /// `A?` Optional(Box), + /// `!A` + NegativeLookahead(Box), /// `A*` Repeat(Box), /// `A*?` @@ -85,7 +87,7 @@ pub enum ExpressionKind { /// `^ A B C` Cut(Box), /// `U+0060` - Unicode(String), + Unicode((char, String)), } #[derive(Copy, Clone, Debug)] @@ -113,7 +115,34 @@ pub enum Characters { /// `` `_` `` Terminal(String), /// `` `A`-`Z` `` - Range(char, char), + Range(Character, Character), +} + +#[derive(Clone, Debug)] +pub enum Character { + Char(char), + /// `U+0060` + /// + /// The `String` is the hex digits after `U+`. 
+ Unicode((char, String)), +} + +impl Character { + pub fn get_ch(&self) -> char { + match self { + Character::Char(ch) => *ch, + Character::Unicode((ch, _)) => *ch, + } + } +} + +impl Display for Character { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Character::Char(ch) => write!(f, "`{ch}`"), + Character::Unicode((_, s)) => write!(f, "U+{s}"), + } + } } impl Grammar { @@ -137,6 +166,7 @@ impl Expression { match &self.kind { ExpressionKind::Grouped(e) | ExpressionKind::Optional(e) + | ExpressionKind::NegativeLookahead(e) | ExpressionKind::Repeat(e) | ExpressionKind::RepeatNonGreedy(e) | ExpressionKind::RepeatPlus(e) diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index f65cb80f97..0db6b478b5 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -1,6 +1,6 @@ //! A parser of the EBNF-like grammar. -use super::{Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit}; +use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit}; use std::fmt; use std::fmt::Display; use std::path::Path; @@ -221,7 +221,7 @@ impl Parser<'_> { }; let kind = if self.take_str("U+") { - self.parse_unicode()? + ExpressionKind::Unicode(self.parse_unicode()?) } else if self.input[self.index..] .chars() .next() @@ -251,6 +251,8 @@ impl Parser<'_> { self.parse_grouped()? } else if next == b'~' { self.parse_neg_expression()? + } else if next == b'!' { + self.parse_negative_lookahead()? } else { return Ok(None); }; @@ -320,27 +322,19 @@ impl Parser<'_> { /// Parse an element of a character class, e.g. /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``. fn parse_characters(&mut self) -> Result> { - if let Some(b'`') = self.peek() { - let recov = self.index; - let a = self.parse_terminal_str()?; + if let Some(a) = self.parse_character()? { if self.take_str("-") { - //~^ Parse `` `a`-`b` `` character range. 
- if a.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid start terminal in range"); - } - let recov = self.index; - let b = self.parse_terminal_str()?; - if b.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid end terminal in range"); - } - let a = a.chars().next().unwrap(); - let b = b.chars().next().unwrap(); + let Some(b) = self.parse_character()? else { + bail!(self, "expected character in range"); + }; Ok(Some(Characters::Range(a, b))) } else { //~^ Parse terminal in backticks. - Ok(Some(Characters::Terminal(a))) + let t = match a { + Character::Char(ch) => ch.to_string(), + Character::Unicode(_) => bail!(self, "unicode not supported"), + }; + Ok(Some(Characters::Terminal(t))) } } else if let Some(name) = self.parse_name() { //~^ Parse nonterminal identifier. @@ -350,6 +344,23 @@ impl Parser<'_> { } } + fn parse_character(&mut self) -> Result> { + if let Some(b'`') = self.peek() { + let recov = self.index; + let term = self.parse_terminal_str()?; + if term.len() > 1 { + self.index = recov + 1; + bail!(self, "invalid start terminal in range"); + } + let ch = term.chars().next().unwrap(); + Ok(Some(Character::Char(ch))) + } else if self.take_str("U+") { + Ok(Some(Character::Unicode(self.parse_unicode()?))) + } else { + Ok(None) + } + } + /// Parse e.g. ``. fn parse_prose(&mut self) -> Result { self.expect("<", "expected opening `<`")?; @@ -387,10 +398,19 @@ impl Parser<'_> { Ok(ExpressionKind::NegExpression(box_kind(kind))) } + fn parse_negative_lookahead(&mut self) -> Result { + self.expect("!", "expected !")?; + self.space0(); + let Some(e) = self.parse_expr1()? else { + bail!(self, "expected expression after !"); + }; + Ok(ExpressionKind::NegativeLookahead(Box::new(e))) + } + /// Parse e.g. `F00F` after `U+`. 
- fn parse_unicode(&mut self) -> Result { - let mut xs = Vec::with_capacity(4); - for _ in 0..4 { + fn parse_unicode(&mut self) -> Result<(char, String)> { + let mut xs = Vec::with_capacity(6); + let mut push_next = || { match self.peek() { Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => { xs.push(x); @@ -398,8 +418,19 @@ impl Parser<'_> { } _ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"), } + Ok(()) + }; + for _ in 0..4 { + push_next()?; + } + for _ in 0..2 { + if push_next().is_err() { + break; + } } - Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap())) + let s = String::from_utf8(xs).unwrap(); + let ch = char::from_u32(u32::from_str_radix(&s, 16).unwrap()).unwrap(); + Ok((ch, s)) } /// Parse `?` after expression. @@ -542,7 +573,7 @@ fn translate_position(input: &str, index: usize) -> (&str, usize, usize) { #[cfg(test)] mod tests { use crate::parser::{parse_grammar, translate_position}; - use crate::{ExpressionKind, Grammar, RangeLimit}; + use crate::{Character, Characters, ExpressionKind, Grammar, RangeLimit}; use std::path::Path; #[test] @@ -747,4 +778,333 @@ mod tests { assert_eq!(max, Some(1)); assert!(matches!(limit, RangeLimit::HalfOpen)); } + + // --- Negative lookahead tests --- + + #[test] + fn lookahead_simple_nonterminal() { + let input = "Rule -> !Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else { + panic!("expected NegativeLookahead, got {:?}", rule.expression.kind); + }; + assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "Foo")); + } + + #[test] + fn lookahead_terminal() { + let input = "Rule -> !`'` Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 2); + let 
ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + assert!(matches!(&inner.kind, ExpressionKind::Terminal(t) if t == "'")); + assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "Foo")); + } + + #[test] + fn lookahead_charset() { + let input = "Rule -> ![`e` `E`] SUFFIX"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 2); + let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + let ExpressionKind::Charset(chars) = &inner.kind else { + panic!("expected Charset inside lookahead, got {:?}", inner.kind); + }; + assert_eq!(chars.len(), 2); + assert!(matches!(&chars[0], Characters::Terminal(t) if t == "e")); + assert!(matches!(&chars[1], Characters::Terminal(t) if t == "E")); + } + + #[test] + fn lookahead_grouped() { + let input = "Rule -> !(`.` | `_` | XID_Start)"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::NegativeLookahead(inner) = &rule.expression.kind else { + panic!("expected NegativeLookahead, got {:?}", rule.expression.kind); + }; + let ExpressionKind::Grouped(grouped) = &inner.kind else { + panic!("expected Grouped inside lookahead, got {:?}", inner.kind); + }; + let ExpressionKind::Alt(alts) = &grouped.kind else { + panic!("expected Alt inside Grouped, got {:?}", grouped.kind); + }; + assert_eq!(alts.len(), 3); + assert!(matches!(&alts[0].kind, ExpressionKind::Terminal(t) if t == ".")); + assert!(matches!(&alts[1].kind, ExpressionKind::Terminal(t) if t == "_")); + assert!(matches!(&alts[2].kind, ExpressionKind::Nt(n) if n == "XID_Start")); + } + + #[test] + fn lookahead_in_sequence_middle() { + let input = "Rule -> A 
!B C"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 3); + assert!(matches!(&seq[0].kind, ExpressionKind::Nt(n) if n == "A")); + let ExpressionKind::NegativeLookahead(inner) = &seq[1].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[1].kind); + }; + assert!(matches!(&inner.kind, ExpressionKind::Nt(n) if n == "B")); + assert!(matches!(&seq[2].kind, ExpressionKind::Nt(n) if n == "C")); + } + + #[test] + fn lookahead_in_repetition() { + let input = "Rule -> (!A B)*"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Repeat(rep) = &rule.expression.kind else { + panic!("expected Repeat, got {:?}", rule.expression.kind); + }; + let ExpressionKind::Grouped(grouped) = &rep.kind else { + panic!("expected Grouped inside Repeat, got {:?}", rep.kind); + }; + let ExpressionKind::Sequence(seq) = &grouped.kind else { + panic!("expected Sequence inside Grouped, got {:?}", grouped.kind); + }; + assert_eq!(seq.len(), 2); + assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_))); + assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B")); + } + + #[test] + fn lookahead_in_alternation() { + let input = "Rule -> !A B | C"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Alt(alts) = &rule.expression.kind else { + panic!("expected Alt, got {:?}", rule.expression.kind); + }; + assert_eq!(alts.len(), 2); + let ExpressionKind::Sequence(seq) = &alts[0].kind else { + panic!("expected Sequence, got {:?}", alts[0].kind); + }; + assert_eq!(seq.len(), 2); + assert!(matches!(&seq[0].kind, ExpressionKind::NegativeLookahead(_))); + assert!(matches!(&seq[1].kind, ExpressionKind::Nt(n) if n == "B")); + 
assert!(matches!(&alts[1].kind, ExpressionKind::Nt(n) if n == "C")); + } + + #[test] + fn lookahead_fail_trailing() { + let input = "Rule -> !"; + let err = parse(input).unwrap_err(); + assert!(err.contains("expected expression after !")); + } + + // --- Unicode tests --- + + #[test] + fn unicode_4_digit() { + let input = "Rule -> U+0009"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else { + panic!("expected Unicode, got {:?}", rule.expression.kind); + }; + assert_eq!(*ch, '\t'); + assert_eq!(s, "0009"); + } + + #[test] + fn unicode_5_digit() { + let input = "Rule -> U+E0000"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else { + panic!("expected Unicode, got {:?}", rule.expression.kind); + }; + assert_eq!(*ch, '\u{E0000}'); + assert_eq!(s, "E0000"); + } + + #[test] + fn unicode_6_digit() { + let input = "Rule -> U+10FFFF"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Unicode((ch, s)) = &rule.expression.kind else { + panic!("expected Unicode, got {:?}", rule.expression.kind); + }; + assert_eq!(*ch, '\u{10FFFF}'); + assert_eq!(s, "10FFFF"); + } + + #[test] + fn unicode_in_alternation() { + let input = "Rule -> U+0009 | U+000A"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Alt(alts) = &rule.expression.kind else { + panic!("expected Alt, got {:?}", rule.expression.kind); + }; + assert_eq!(alts.len(), 2); + assert!(matches!( + &alts[0].kind, + ExpressionKind::Unicode((ch, _)) if *ch == '\t' + )); + assert!(matches!( + &alts[1].kind, + ExpressionKind::Unicode((ch, _)) if *ch == '\n' + )); + } + + // --- Character / charset range tests --- + + #[test] + fn charset_unicode_range() { + let input = "Rule -> 
[U+0000-U+007F]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); + assert!(matches!( + b, + Character::Unicode((ch, _)) if *ch == '\u{7F}' + )); + } + + #[test] + fn charset_char_range() { + let input = "Rule -> [`a`-`z`]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a, Character::Char(ch) if *ch == 'a')); + assert!(matches!(b, Character::Char(ch) if *ch == 'z')); + } + + #[test] + fn charset_mixed_range() { + let input = "Rule -> [`a`-U+007A]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a, Character::Char(ch) if *ch == 'a')); + assert!(matches!( + b, + Character::Unicode((ch, _)) if *ch == 'z' + )); + } + + #[test] + fn charset_multiple_unicode_ranges() { + let input = "Rule -> [U+0000-U+D7FF U+E000-U+10FFFF]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", 
rule.expression.kind); + }; + assert_eq!(chars.len(), 2); + let Characters::Range(a1, b1) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a1, Character::Unicode((ch, _)) if *ch == '\0')); + assert!(matches!(b1, Character::Unicode((ch, _)) if *ch == '\u{D7FF}')); + let Characters::Range(a2, b2) = &chars[1] else { + panic!("expected Range, got {:?}", chars[1]); + }; + assert!(matches!(a2, Character::Unicode((ch, _)) if *ch == '\u{E000}')); + assert!(matches!(b2, Character::Unicode((ch, _)) if *ch == '\u{10FFFF}')); + } + + #[test] + fn charset_terminals_and_named() { + let input = "Rule -> [`a` `b` Foo]"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Charset(chars) = &rule.expression.kind else { + panic!("expected Charset, got {:?}", rule.expression.kind); + }; + assert_eq!(chars.len(), 3); + assert!(matches!(&chars[0], Characters::Terminal(t) if t == "a")); + assert!(matches!(&chars[1], Characters::Terminal(t) if t == "b")); + assert!(matches!(&chars[2], Characters::Named(n) if n == "Foo")); + } + + // --- Negative lookahead combined with charset --- + + #[test] + fn lookahead_charset_with_named_and_terminals() { + // Pattern from tokens.md: ![`'` `\` LF CR TAB] ASCII + let input = "Rule -> ![`x` `y` LF] Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + assert_eq!(seq.len(), 2); + let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + let ExpressionKind::Charset(chars) = &inner.kind else { + panic!("expected Charset, got {:?}", inner.kind); + }; + assert_eq!(chars.len(), 3); + assert!(matches!(&chars[0], Characters::Terminal(t) if t == "x")); + assert!(matches!(&chars[1], 
Characters::Terminal(t) if t == "y")); + assert!(matches!(&chars[2], Characters::Named(n) if n == "LF")); + } + + // --- Negative lookahead combined with Unicode --- + + #[test] + fn lookahead_charset_with_unicode_range() { + let input = "Rule -> ![U+0000-U+007F] Foo"; + let grammar = parse(input).unwrap(); + let rule = grammar.productions.get("Rule").unwrap(); + let ExpressionKind::Sequence(seq) = &rule.expression.kind else { + panic!("expected Sequence, got {:?}", rule.expression.kind); + }; + let ExpressionKind::NegativeLookahead(inner) = &seq[0].kind else { + panic!("expected NegativeLookahead, got {:?}", seq[0].kind); + }; + let ExpressionKind::Charset(chars) = &inner.kind else { + panic!("expected Charset, got {:?}", inner.kind); + }; + assert_eq!(chars.len(), 1); + let Characters::Range(a, b) = &chars[0] else { + panic!("expected Range, got {:?}", chars[0]); + }; + assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); + assert!(matches!( + b, + Character::Unicode((ch, _)) if *ch == '\u{7F}' + )); + } } diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index edec8da035..316eb9aaf3 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production}; use regex::Regex; use std::borrow::Cow; use std::fmt::Write; @@ -67,6 +67,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => last_expr(es.last().unwrap()), ExpressionKind::Grouped(_) | ExpressionKind::Optional(_) + | ExpressionKind::NegativeLookahead(_) | ExpressionKind::Repeat(_) | ExpressionKind::RepeatNonGreedy(_) | ExpressionKind::RepeatPlus(_) @@ -119,6 +120,10 @@ fn render_expression(expr: 
&Expression, cx: &RenderCtx, output: &mut String) { render_expression(e, cx, output); output.push_str("?"); } + ExpressionKind::NegativeLookahead(e) => { + output.push('!'); + render_expression(e, cx, output); + } ExpressionKind::Repeat(e) => { render_expression(e, cx, output); output.push_str("\\*"); @@ -181,7 +186,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { output.push_str("^ "); render_expression(e, cx, output); } - ExpressionKind::Unicode(s) => { + ExpressionKind::Unicode((_, s)) => { output.push_str("U+"); output.push_str(s); } @@ -222,12 +227,20 @@ fn render_characters(chars: &Characters, cx: &RenderCtx, output: &mut String) { markdown_escape(s) ) .unwrap(), - Characters::Range(a, b) => write!( - output, - "{a}\ - -{b}" - ) - .unwrap(), + Characters::Range(a, b) => { + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => write!( + output, + "{}", + markdown_escape(&ch.to_string()) + ) + .unwrap(), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, output); + output.push('-'); + write_ch(b, output); + } } } @@ -237,3 +250,181 @@ fn markdown_escape(s: &str) -> Cow<'_, str> { LazyLock::new(|| Regex::new(r#"[\\`_*\[\](){}'".-]"#).unwrap()); ESC_RE.replace_all(s, r"\$0") } + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + /// Creates a minimal `RenderCtx` for testing. + fn test_cx() -> RenderCtx { + RenderCtx { + md_link_map: HashMap::new(), + rr_link_map: HashMap::new(), + for_summary: false, + } + } + + /// Renders a single expression to a markdown string. 
+ fn render(kind: ExpressionKind) -> String { + let cx = test_cx(); + let expr = Expression::new_kind(kind); + let mut output = String::new(); + render_expression(&expr, &cx, &mut output); + output + } + + // -- Negative lookahead tests -- + + #[test] + fn lookahead_nonterminal() { + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Nt("CHAR".to_string())), + ))); + assert!(result.contains("!"), "should contain `!` prefix"); + assert!( + result.contains("CHAR"), + "should contain the nonterminal name" + ); + } + + #[test] + fn lookahead_terminal() { + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Terminal("'".to_string())), + ))); + assert!(result.starts_with("!"), "should start with `!`"); + assert!( + result.contains("grammar-literal"), + "should render inner terminal as a grammar literal" + ); + } + + #[test] + fn lookahead_charset() { + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Charset(vec![ + Characters::Terminal("e".to_string()), + Characters::Terminal("E".to_string()), + ])), + ))); + assert!(result.starts_with("!"), "should start with `!`"); + assert!( + result.contains("\\["), + "should contain escaped opening bracket for charset" + ); + } + + #[test] + fn lookahead_grouped() { + // !( `.` | `_` ) + let inner = + ExpressionKind::Grouped(Box::new(Expression::new_kind(ExpressionKind::Alt(vec![ + Expression::new_kind(ExpressionKind::Terminal(".".to_string())), + Expression::new_kind(ExpressionKind::Terminal("_".to_string())), + ])))); + let result = render(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(inner), + ))); + assert!(result.starts_with("!(")); + assert!(result.contains("|")); + } + + // -- Unicode tests -- + + #[test] + fn unicode_4_digit() { + let result = render(ExpressionKind::Unicode(('\t', "0009".to_string()))); + assert_eq!(result, "U+0009"); + } + + 
#[test] + fn unicode_6_digit() { + let result = render(ExpressionKind::Unicode(( + '\u{10FFFF}', + "10FFFF".to_string(), + ))); + assert_eq!(result, "U+10FFFF"); + } + + // -- Charset with Unicode range tests -- + + #[test] + fn charset_unicode_range() { + let result = render(ExpressionKind::Charset(vec![Characters::Range( + Character::Unicode(('\0', "0000".to_string())), + Character::Unicode(('\u{007F}', "007F".to_string())), + )])); + assert!(result.contains("\\[")); + assert!(result.contains("U+0000")); + assert!(result.contains("U+007F")); + assert!(result.contains("-")); + } + + #[test] + fn charset_char_range() { + let result = render(ExpressionKind::Charset(vec![Characters::Range( + Character::Char('a'), + Character::Char('z'), + )])); + assert!(result.contains("\\[")); + assert!(result.contains("grammar-literal")); + assert!(result.contains("-")); + } + + #[test] + fn charset_mixed_range() { + // [`a`-U+007A] + let result = render(ExpressionKind::Charset(vec![Characters::Range( + Character::Char('a'), + Character::Unicode(('\u{007A}', "007A".to_string())), + )])); + assert!(result.contains("grammar-literal")); + assert!(result.contains("U+007A")); + assert!(result.contains("-")); + } + + // -- Cut test -- + + #[test] + fn cut_rendering() { + let result = render(ExpressionKind::Cut(Box::new(Expression::new_kind( + ExpressionKind::Nt("Foo".to_string()), + )))); + assert!(result.starts_with("^ "), "cut should render as `^ ` prefix"); + assert!(result.contains("Foo")); + } + + // -- NegExpression test -- + + #[test] + fn neg_expression_rendering() { + let result = render(ExpressionKind::NegExpression(Box::new( + Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( + "a".to_string(), + )])), + ))); + assert!( + result.starts_with("~"), + "neg expression should render as `~` prefix" + ); + } + + // -- Markdown escape tests -- + + #[test] + fn markdown_escape_backtick() { + assert_eq!(markdown_escape("`"), "\\`"); + } + + #[test] + fn 
markdown_escape_brackets() { + assert_eq!(markdown_escape("["), "\\["); + assert_eq!(markdown_escape("]"), "\\]"); + } + + #[test] + fn markdown_escape_plain() { + assert_eq!(markdown_escape("abc"), "abc"); + } +} diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index ebb20af1bc..ad7b291e57 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production, RangeLimit}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production, RangeLimit}; use railroad::*; use regex::Regex; use std::fmt::Write; @@ -143,6 +143,12 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option { + let forward = render_expression(e, cx, stack)?; + let lbox = + LabeledBox::new(forward, Comment::new("not followed by".to_string())); + Box::new(lbox) + } // Treat `e?` and `e{..=1}` / `e{0..=1}` equally. 
ExpressionKind::Optional(e) | ExpressionKind::RepeatRange { @@ -292,7 +298,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option Box::new(Terminal::new(format!("U+{}", s))), + ExpressionKind::Unicode((_, s)) => Box::new(Terminal::new(format!("U+{}", s))), }; } }; @@ -311,7 +317,17 @@ fn render_characters(chars: &Characters, cx: &RenderCtx) -> Box { match chars { Characters::Named(s) => node_for_nt(cx, s), Characters::Terminal(s) => Box::new(Terminal::new(s.clone())), - Characters::Range(a, b) => Box::new(Terminal::new(format!("{a}-{b}"))), + Characters::Range(a, b) => { + let mut s = String::new(); + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => output.push(*ch), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, &mut s); + s.push('-'); + write_ch(b, &mut s); + Box::new(Terminal::new(s)) + } } } @@ -375,7 +391,7 @@ impl Node for Except { #[cfg(test)] mod tests { use super::*; - use grammar::{Expression, ExpressionKind, RangeLimit}; + use grammar::{Character, Characters, Expression, ExpressionKind, RangeLimit}; /// Render an expression to an SVG string fragment. fn render_to_svg(expr: &Expression) -> Option { @@ -395,6 +411,8 @@ mod tests { }) } + // -- RepeatRange tests -- + #[test] fn test_empty_exclusive_equal() { // `e{2..2}` (half-open, min == max) renders as empty. 
@@ -460,4 +478,117 @@ mod tests { "expected nonterminal for e{{..=1}}, got: {svg}" ); } + + // -- Negative lookahead tests -- + + #[test] + fn lookahead_nonterminal() { + let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Nt("CHAR".to_string())), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!( + svg.contains("not followed by"), + "should contain the 'not followed by' label" + ); + assert!(svg.contains("CHAR"), "should contain the nonterminal name"); + } + + #[test] + fn lookahead_terminal() { + let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Terminal("CR".to_string())), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("not followed by")); + assert!(svg.contains("CR")); + } + + #[test] + fn lookahead_charset() { + let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( + Expression::new_kind(ExpressionKind::Charset(vec![ + Characters::Terminal("e".to_string()), + Characters::Terminal("E".to_string()), + ])), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("not followed by")); + assert!(svg.contains("e")); + assert!(svg.contains("E")); + } + + // -- Unicode tests -- + + #[test] + fn unicode_4_digit() { + let expr = Expression::new_kind(ExpressionKind::Unicode(('\t', "0009".to_string()))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("U+0009"), "should render Unicode code point"); + } + + #[test] + fn unicode_6_digit() { + let expr = Expression::new_kind(ExpressionKind::Unicode(( + '\u{10FFFF}', + "10FFFF".to_string(), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("U+10FFFF")); + } + + // -- Charset with ranges -- + + #[test] + fn charset_unicode_range() { + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( + Character::Unicode(('\0', "0000".to_string())), + 
Character::Unicode(('\u{007F}', "007F".to_string())), + )])); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("U+0000")); + assert!(svg.contains("U+007F")); + } + + #[test] + fn charset_char_range() { + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( + Character::Char('a'), + Character::Char('z'), + )])); + let svg = render_to_svg(&expr).unwrap(); + assert!(svg.contains("a")); + assert!(svg.contains("z")); + } + + // -- Cut test -- + + #[test] + fn cut_rendering() { + let expr = Expression::new_kind(ExpressionKind::Cut(Box::new(Expression::new_kind( + ExpressionKind::Nt("Foo".to_string()), + )))); + let svg = render_to_svg(&expr).unwrap(); + assert!( + svg.contains("no backtracking"), + "cut should render with 'no backtracking' label" + ); + assert!(svg.contains("Foo")); + } + + // -- NegExpression test -- + + #[test] + fn neg_expression_rendering() { + let expr = Expression::new_kind(ExpressionKind::NegExpression(Box::new( + Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( + "a".to_string(), + )])), + ))); + let svg = render_to_svg(&expr).unwrap(); + assert!( + svg.contains("with the exception of"), + "neg expression should have exception label" + ); + } }