Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(parse/md): markdown header support in lexer #5208

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 174 additions & 0 deletions crates/biome_markdown_factory/src/generated/node_factory.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

198 changes: 198 additions & 0 deletions crates/biome_markdown_factory/src/generated/syntax_factory.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 32 additions & 2 deletions crates/biome_markdown_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
//! An extremely fast, lookup table based, Markdown lexer which yields `MarkdownSyntaxKind` tokens used by the Biome Markdown parser.
#[rustfmt::skip]
mod tests;

use biome_markdown_syntax::MarkdownSyntaxKind;
use biome_markdown_syntax::MarkdownSyntaxKind::*;
use biome_markdown_syntax::T;
use biome_parser::diagnostic::ParseDiagnostic;
use biome_parser::lexer::{
LexContext, Lexer, LexerCheckpoint, LexerWithCheckpoint, ReLexer, TokenFlags,
@@ -181,10 +180,25 @@ impl<'src> MarkdownLexer<'src> {
match dispatched {
WHS => self.consume_newline_or_whitespace(),
MUL | MIN | IDT => self.consume_thematic_break_literal(),
HAS => self.consume_header(),
_ => self.consume_textual(),
}
}

/// Lexes a single `#` as its own token.
///
/// Exactly one byte is consumed and `T![#]` is returned when the lexer is
/// positioned on `#`. Any other input is handed to `consume_textual`, so the
/// function still yields a token if it is ever dispatched on the wrong byte.
fn consume_header(&mut self) -> MarkdownSyntaxKind {
    self.assert_at_char_boundary();

    match self.current_byte() {
        // One hash, one token: runs of hashes are not grouped here.
        Some(b'#') => {
            self.advance(1);
            T![#]
        }
        // Not expected when dispatched on `#`; degrade to plain text.
        _ => self.consume_textual(),
    }
}

/// Returns the lexer's current `position` converted to a `TextSize`.
///
/// # Panics
///
/// Panics if the position cannot be converted to a `TextSize`.
fn text_position(&self) -> TextSize {
    let converted = TextSize::try_from(self.position);
    converted.expect("Input to be smaller than 4 GB")
}
@@ -356,9 +370,25 @@ impl<'src> MarkdownLexer<'src> {
/// Lexes a run of text as a single `MD_TEXTUAL_LITERAL`.
///
/// The character under the cursor is always consumed. After that the literal
/// keeps growing until the input ends or the next byte is one of `\n`, `\r`,
/// `\t`, `#`, `*`, `-` or `_`. A plain space does not end the run, so words
/// separated by spaces stay in the same token.
fn consume_textual(&mut self) -> MarkdownSyntaxKind {
    self.assert_at_char_boundary();

    // The leading character belongs to the literal unconditionally.
    let first = self.current_char_unchecked();
    self.advance(first.len_utf8());

    while let Some(byte) = self.current_byte() {
        // Line breaks, tabs and Markdown marker bytes terminate the literal.
        if matches!(byte, b'\n' | b'\r' | b'\t' | b'#' | b'*' | b'-' | b'_') {
            break;
        }

        // Advance by a whole character so multi-byte UTF-8 stays intact.
        let ch = self.current_char_unchecked();
        self.advance(ch.len_utf8());
    }

    MD_TEXTUAL_LITERAL
}

115 changes: 115 additions & 0 deletions crates/biome_markdown_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
@@ -25,6 +25,9 @@ macro_rules! assert_lex {
tokens.push((lexer.current(), lexer.current_range()));
}

// TODO: remove this debug print
println!("tokens: {:#?}", tokens);

$(
assert_eq!(
tokens[idx].0,
@@ -140,6 +143,118 @@ fn whitespace() {
}
}

// A level-1 heading without a trailing newline: the `#` is a HASH token, the
// following space a WHITESPACE token, and "Heading 1" (inner space included)
// a single MD_TEXTUAL_LITERAL.
// NOTE(review): the `:N` suffix is presumably the expected token length in
// bytes — confirm against the `assert_lex!` definition.
#[test]
fn heading_level_1() {
assert_lex! {
"# Heading 1",
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:9,
}
}

// Same input as a bare level-1 heading, but terminated by `\n`: the textual
// literal must stop before the line break, which is expected as a separate
// NEWLINE token.
#[test]
fn heading_level_1_with_newline() {
assert_lex! {
"# Heading 1\n",
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:9,
NEWLINE:1,
}
}

// Two leading hashes are expected as two separate HASH tokens (the lexer does
// not group them), followed by WHITESPACE and the heading text.
#[test]
fn heading_level_2() {
assert_lex! {
"## Heading 2",
HASH:1,
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:9,
}
}

// Three leading hashes are expected as three separate HASH tokens, followed
// by WHITESPACE and the heading text.
#[test]
fn heading_level_3() {
assert_lex! {
"### Heading 3",
HASH:1,
HASH:1,
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:9,
}
}

// Four leading hashes are expected as four separate HASH tokens, followed by
// WHITESPACE and the heading text.
#[test]
fn heading_level_4() {
assert_lex! {
"#### Heading 4",
HASH:1,
HASH:1,
HASH:1,
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:9,
}
}

// Five leading hashes are expected as five separate HASH tokens, followed by
// WHITESPACE and the heading text.
#[test]
fn heading_level_5() {
assert_lex! {
"##### Heading 5",
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:9,
}
}

// Six leading hashes are expected as six separate HASH tokens, followed by
// WHITESPACE and the heading text.
#[test]
fn heading_level_6() {
assert_lex! {
"###### Heading 6",
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:9,
}
}

// Fourteen hashes in a row: this pins the lexer's current output, which is
// one HASH token per `#`. The text "not-heading" is split at the `-` into
// "not", an ERROR_TOKEN for the dash, and "heading".
#[test]
// TODO: this input should lex as a MD_TEXTUAL_LITERAL token instead of the
// HASH/ERROR_TOKEN stream asserted below.
fn not_a_heading() {
assert_lex! {
"############## not-heading",
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
HASH:1,
WHITESPACE:1,
MD_TEXTUAL_LITERAL:3,
ERROR_TOKEN:1,
MD_TEXTUAL_LITERAL:7,
}
}

#[test]
fn thematic_break_literal() {
assert_lex! {
49 changes: 44 additions & 5 deletions crates/biome_markdown_parser/src/syntax.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
pub mod thematic_break_block;
pub mod atx_headings;

use biome_markdown_syntax::{T, kind::MarkdownSyntaxKind::*};
use biome_parser::{
prelude::{ParsedSyntax::{self, *}, TokenSource},
Parser,
prelude::ParsedSyntax::{self, *},
};
use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block};
use atx_headings::{at_atx_heading, parse_atx_heading};

use crate::MarkdownParser;

@@ -25,7 +27,9 @@ pub(crate) fn parse_block_list(p: &mut MarkdownParser) -> ParsedSyntax {
}

pub(crate) fn parse_any_block(p: &mut MarkdownParser) {
if at_indent_code_block(p) {
if at_atx_heading(p) {
let _ = parse_atx_heading(p);
} else if at_indent_code_block(p) {
parse_indent_code_block(p);
} else if at_thematic_break_block(p) {
let break_block = try_parse(p, |p| {
@@ -36,8 +40,10 @@ pub(crate) fn parse_any_block(p: &mut MarkdownParser) {
Ok(break_block)
});
if break_block.is_err() {
parse_paragraph(p);
let _ = parse_paragraph(p);
}
} else {
let _ = parse_paragraph(p);
}
}

@@ -49,8 +55,41 @@ pub(crate) fn parse_indent_code_block(_p: &mut MarkdownParser) {
todo!()
}

pub(crate) fn parse_paragraph(_p: &mut MarkdownParser) {
todo!()
/// Parses a paragraph and wraps it in an `MD_PARAGRAPH` node.
///
/// The first line is always consumed. Further lines are appended until the
/// parser reaches EOF, a blank line, an ATX heading, a thematic break, or an
/// indented code block. Always returns `Present`.
pub(crate) fn parse_paragraph(p: &mut MarkdownParser) -> ParsedSyntax {
    let marker = p.start();

    // The line we are standing on belongs to the paragraph unconditionally.
    parse_paragraph_line(p);

    loop {
        // Checks run in this order and short-circuit on the first hit.
        let paragraph_ends = p.at(T![EOF])
            || is_blank_line(p)
            || at_atx_heading(p)
            || at_thematic_break_block(p)
            || at_indent_code_block(p);
        if paragraph_ends {
            break;
        }

        parse_paragraph_line(p);
    }

    Present(marker.complete(p, MD_PARAGRAPH))
}

/// Returns `true` when the parser is at a blank line: either directly on a
/// `NEWLINE`, or on a single `WHITESPACE` token immediately followed by one.
fn is_blank_line(p: &mut MarkdownParser) -> bool {
    if p.at(NEWLINE) {
        return true;
    }

    // Whitespace only counts as blank when the very next token ends the line.
    p.at(WHITESPACE) && p.nth(1) == NEWLINE
}

/// Consumes one line of paragraph content.
///
/// Every token up to (not including) the next `NEWLINE` or EOF is bumped
/// as-is; a terminating `NEWLINE`, if there is one, is then consumed too.
pub(crate) fn parse_paragraph_line(p: &mut MarkdownParser) {
    while !(p.at(T![EOF]) || p.at(NEWLINE)) {
        // Bump whatever token is current, keeping its own kind.
        let kind = p.source().current();
        p.bump(kind);
    }

    // The line break is part of the line when present (absent at EOF).
    if p.at(NEWLINE) {
        p.bump(NEWLINE);
    }
}

/// Attempt to parse some input with the given parsing function. If parsing
85 changes: 85 additions & 0 deletions crates/biome_markdown_parser/src/syntax/atx_headings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use crate::parser::MarkdownParser;
use biome_markdown_syntax::MarkdownSyntaxKind::*;
use biome_markdown_syntax::T;
use biome_parser::{
prelude::ParsedSyntax::{self, *},
Parser,
prelude::TokenSource,
};

/// Returns `true` when the upcoming tokens look like the start of an ATX
/// heading.
///
/// That means one to six `#` tokens followed by `WHITESPACE`, `NEWLINE`, or
/// EOF. A seventh `#` directly after the sixth makes the check fail, because
/// the token after the counted run is then another `#`.
pub(crate) fn at_atx_heading(p: &mut MarkdownParser) -> bool {
    if !p.at(T![#]) {
        return false;
    }

    // Count the leading run of hashes, capped at six.
    let mut leading_hashes = 0;
    loop {
        if p.nth(leading_hashes) != T![#] || leading_hashes >= 6 {
            break;
        }
        leading_hashes += 1;
    }

    // The token right after the run decides whether this is a heading.
    let following = p.nth(leading_hashes);
    [WHITESPACE, NEWLINE, T![EOF]].contains(&following)
}

pub(crate) fn parse_atx_heading(p: &mut MarkdownParser) -> ParsedSyntax {
let m = p.start();

// Parse opening hash marks
let hash_list_m = p.start();
let mut hash_count = 0;
while p.at(T![#]) && hash_count < 6 {
p.bump(T![#]);
hash_count += 1;
}
hash_list_m.complete(p, MD_HASH_LIST);

// Skip whitespace after the hash marks
if p.at(WHITESPACE) {
p.bump(WHITESPACE);
}
Comment on lines +39 to +42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, I think the whitespace is required for it to become a heading. Do you have a source for this behavior?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this example the space is skipped: https://spec.commonmark.org/0.31.2/#example-62
In this example the spaces are also skipped: https://spec.commonmark.org/0.31.2/#example-62

Copy link
Contributor

@dyc3 dyc3 Mar 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At least one space or tab is required between the # characters and the heading’s contents, unless the heading is empty. Note that many implementations currently do not require the space.

The code here makes the whitespace optional when it is actually required. See example 64 in that doc


// Parse heading content as a paragraph (optional)
if !p.at(NEWLINE) && !p.at(T![EOF]) {
let paragraph_m = p.start();

// Parse until end of line, or until trailing hashes
while !p.at(NEWLINE) && !p.at(T![EOF]) && !p.at(T![#]) {
p.bump(p.source().current());
}

paragraph_m.complete(p, MD_PARAGRAPH);
}

// Parse trailing hash marks (optional)
let trailing_hash_list_m = p.start();
while p.at(T![#]) {
p.bump(T![#]);
}
trailing_hash_list_m.complete(p, MD_HASH_LIST);

// Skip trailing whitespace
if p.at(WHITESPACE) {
p.bump(WHITESPACE);
}

// Consume the newline if present
if p.at(NEWLINE) {
p.bump(NEWLINE);
}

// Complete with the appropriate heading node type based on hash count
let node = match hash_count {
1 => MD_HEADER1,
2 => MD_HEADER2,
3 => MD_HEADER3,
4 => MD_HEADER4,
5 => MD_HEADER5,
6 => MD_HEADER6,
_ => MD_HEADER // Fallback, should not happen
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be a parsing error instead?

Copy link
Member

@ematipico ematipico Feb 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Markdown, I think, shouldn't have parsing errors. What I mean is that at the end, the language is very lax and, worst case scenario, a paragraph is always emitted.

I've never seen an editor emitting a parsing error 🤔

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The check for whether this is a valid ATX heading or just a paragraph is done in the lexer, but Rust requires a fallback arm here.

};

Present(m.complete(p, node))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Heading 1
## Heading 2
### Heading 3
#### Heading 4
##### Heading 5
###### Heading 6
Empty file.
31 changes: 31 additions & 0 deletions crates/biome_markdown_parser/tests/md_test_suite/ok/empty.md.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
source: crates/biome_markdown_parser/tests/spec_test.rs
expression: snapshot
snapshot_kind: text
---
## Input

```
```


## AST

```
MdDocument {
bom_token: missing (optional),
value: MdBlockList [],
eof_token: EOF@0..0 "" [] [],
}
```

## CST

```
0: MD_DOCUMENT@0..0
0: (empty)
1: MD_BLOCK_LIST@0..0
2: EOF@0..0 "" [] []
```
6 changes: 6 additions & 0 deletions crates/biome_markdown_syntax/src/generated/kind.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions crates/biome_markdown_syntax/src/generated/macros.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

612 changes: 612 additions & 0 deletions crates/biome_markdown_syntax/src/generated/nodes.rs

Large diffs are not rendered by default.

120 changes: 120 additions & 0 deletions crates/biome_markdown_syntax/src/generated/nodes_mut.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions xtask/codegen/markdown.ungram
Original file line number Diff line number Diff line change
@@ -58,6 +58,13 @@ AnyContainerBlock = MdQuote | MdBulletListItem | MdOrderListItem
// h1..h6
MdHeader = before:MdHashList MdParagraph? after:MdHashList

MdHeader1 = before:MdHashList MdParagraph? after:MdHashList
MdHeader2 = before:MdHashList MdParagraph? after:MdHashList
MdHeader3 = before:MdHashList MdParagraph? after:MdHashList
MdHeader4 = before:MdHashList MdParagraph? after:MdHashList
MdHeader5 = before:MdHashList MdParagraph? after:MdHashList
MdHeader6 = before:MdHashList MdParagraph? after:MdHashList
Comment on lines 59 to +66
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change doesn't quite look right to me. We already have MdHeader defined above, but you've added 6 new nodes for headers. IMO, we should rename MdHeader into AnyMdHeader, and have it be a union of all the other header levels.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, but how should we represent the level of the heading?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having MdHeader1 through MdHeader6 is fine. I'm saying we should change MdHeader to:

AnyMdHeader = MdHeader1 | MdHeader2 | MdHeader3 | MdHeader4 | MdHeader5 | MdHeader6


MdHashList = MdHash*

MdHash = '#'
6 changes: 6 additions & 0 deletions xtask/codegen/src/markdown_kinds_src.rs
Original file line number Diff line number Diff line change
@@ -41,6 +41,12 @@ pub const MARKDOWN_KINDS_SRC: KindsSrc = KindsSrc {
"MD_HASH_LIST",
"MD_HASH",
"MD_HEADER",
"MD_HEADER1",
"MD_HEADER2",
"MD_HEADER3",
"MD_HEADER4",
"MD_HEADER5",
"MD_HEADER6",
"MD_INDENT_CODE_BLOCK",
"MD_FENCED_CODE_BLOCK",
"MD_HTML_BLOCK",