biomejs · AugustinMauroy · Feb 25, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 28, 2025
diff --git a/crates/biome_markdown_factory/src/generated/node_factory.rs b/crates/biome_markdown_factory/src/generated/node_factory.rs
diff --git a/crates/biome_markdown_factory/src/generated/syntax_factory.rs b/crates/biome_markdown_factory/src/generated/syntax_factory.rs
diff --git a/crates/biome_markdown_parser/src/lexer/mod.rs b/crates/biome_markdown_parser/src/lexer/mod.rs
@@ -1,10 +1,9 @@
-//! An extremely fast, lookup table based, JSON lexer which yields SyntaxKind tokens used by the rome-json parser.
-
 #[rustfmt::skip]
 mod tests;
 
 use biome_markdown_syntax::MarkdownSyntaxKind;
 use biome_markdown_syntax::MarkdownSyntaxKind::*;
+use biome_markdown_syntax::T;
 use biome_parser::diagnostic::ParseDiagnostic;
 use biome_parser::lexer::{
     LexContext, Lexer, LexerCheckpoint, LexerWithCheckpoint, ReLexer, TokenFlags,
@@ -181,10 +180,25 @@ impl<'src> MarkdownLexer<'src> {
         match dispatched {
             WHS => self.consume_newline_or_whitespace(),
             MUL | MIN | IDT => self.consume_thematic_break_literal(),
+            HAS => self.consume_header(),
             _ => self.consume_textual(),
         }
     }
 
+    /// Lexes a single `#` as `T![#]`; a run of hashes therefore yields one token per `#`.
+    /// Any other byte is forwarded to `consume_textual`.
+    fn consume_header(&mut self) -> MarkdownSyntaxKind {
+        self.assert_at_char_boundary();
+
+        // Guard first: the `HAS` dispatch arm is only expected to land here on `#`.
+        if !matches!(self.current_byte(), Some(b'#')) {
+            return self.consume_textual();
+        }
+
+        self.advance(1);
+        T![#]
+    }
+
     fn text_position(&self) -> TextSize {
         TextSize::try_from(self.position).expect("Input to be smaller than 4 GB")
     }
@@ -356,9 +370,25 @@ impl<'src> MarkdownLexer<'src> {
     fn consume_textual(&mut self) -> MarkdownSyntaxKind {
         self.assert_at_char_boundary();
 
+        // The first character always belongs to the literal, whatever it is.
         let char = self.current_char_unchecked();
         self.advance(char.len_utf8());
 
+        // Extend the literal up to a line break, a tab, or one of `#`, `*`, `-`, `_`.
+        // Plain spaces are not in that stop set, so they stay inside the literal.
+        while let Some(byte) = self.current_byte() {
+            if matches!(byte, b'\n' | b'\r' | b'\t' | b'#' | b'*' | b'-' | b'_') {
+                break;
+            }
+
+            // `byte` only drives the stop test. The step itself is a whole `char`,
+            // because the text may hold multi-byte UTF-8 sequences and the lexer
+            // asserts a char boundary on entry.
+            let width = self.current_char_unchecked().len_utf8();
+            self.advance(width);
+        }
+
         MD_TEXTUAL_LITERAL
     }
 

diff --git a/crates/biome_markdown_parser/src/lexer/tests.rs b/crates/biome_markdown_parser/src/lexer/tests.rs
@@ -25,6 +25,9 @@ macro_rules! assert_lex {
             tokens.push((lexer.current(), lexer.current_range()));
         }
 
+        // TODO: remove this debug print
+        println!("tokens: {:#?}", tokens);
+
         $(
             assert_eq!(
                 tokens[idx].0,
@@ -140,6 +143,118 @@ fn whitespace() {
     }
 }
 
+#[test]
+fn heading_level_1() {
+    assert_lex! {
+        "# Heading 1", // 11 bytes: 1 (`#`) + 1 (space) + 9 (`Heading 1`)
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:9, // the space inside `Heading 1` stays in the literal
+    }
+}
+
+#[test]
+fn heading_level_1_with_newline() {
+    assert_lex! {
+        "# Heading 1\n", // 12 bytes: the 11 of the heading line plus the line feed
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:9,
+        NEWLINE:1, // the line break is expected as its own token, outside the literal
+    }
+}
+
+#[test]
+fn heading_level_2() {
+    assert_lex! {
+        "## Heading 2", // 12 bytes: 2 (`##`) + 1 (space) + 9 (`Heading 2`)
+        HASH:1, // every `#` is expected as a separate HASH token
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:9,
+    }
+}
+
+#[test]
+fn heading_level_3() {
+    assert_lex! {
+        "### Heading 3", // 13 bytes: 3 (`###`) + 1 (space) + 9 (`Heading 3`)
+        HASH:1, // three separate HASH tokens, one per `#`
+        HASH:1,
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:9,
+    }
+}
+
+#[test]
+fn heading_level_4() {
+    assert_lex! {
+        "#### Heading 4", // 14 bytes: 4 (`####`) + 1 (space) + 9 (`Heading 4`)
+        HASH:1, // four separate HASH tokens, one per `#`
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:9,
+    }
+}
+
+#[test]
+fn heading_level_5() {
+    assert_lex! {
+        "##### Heading 5", // 15 bytes: 5 (`#####`) + 1 (space) + 9 (`Heading 5`)
+        HASH:1, // five separate HASH tokens, one per `#`
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:9,
+    }
+}
+
+#[test]
+fn heading_level_6() {
+    assert_lex! {
+        "###### Heading 6", // 16 bytes: 6 (`######`) + 1 (space) + 9 (`Heading 6`)
+        HASH:1, // six separate HASH tokens, one per `#`
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:9,
+    }
+}
+
+#[test]
+// TODO: this should be a MD_TEXTUAL_LITERAL token (presumably the `#` run, which is longer than a heading marker - confirm)
+fn not_a_heading() {
+    assert_lex! {
+        "############## not-heading", // the run is currently expected as one HASH token per `#`
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        HASH:1,
+        WHITESPACE:1,
+        MD_TEXTUAL_LITERAL:3, // `not`
+        ERROR_TOKEN:1, // the `-` in `not-heading` is expected as an error token here
+        MD_TEXTUAL_LITERAL:7, // `heading`
+    }
+}
+
 #[test]
 fn thematic_break_literal() {
     assert_lex! {

diff --git a/crates/biome_markdown_parser/src/syntax.rs b/crates/biome_markdown_parser/src/syntax.rs
@@ -1,11 +1,13 @@
 pub mod thematic_break_block;
+pub mod atx_headings;
 
 use biome_markdown_syntax::{T, kind::MarkdownSyntaxKind::*};
 use biome_parser::{
+    prelude::{ParsedSyntax::{self, *}, TokenSource},
     Parser,
-    prelude::ParsedSyntax::{self, *},
 };
 use thematic_break_block::{at_thematic_break_block, parse_thematic_break_block};
+use atx_headings::{at_atx_heading, parse_atx_heading};
 
 use crate::MarkdownParser;
 
@@ -25,7 +27,9 @@ pub(crate) fn parse_block_list(p: &mut MarkdownParser) -> ParsedSyntax {
 }
 
 pub(crate) fn parse_any_block(p: &mut MarkdownParser) {
-    if at_indent_code_block(p) {
+    if at_atx_heading(p) {
+        let _ = parse_atx_heading(p);
+    } else if at_indent_code_block(p) {
         parse_indent_code_block(p);
     } else if at_thematic_break_block(p) {
         let break_block = try_parse(p, |p| {
@@ -36,8 +40,10 @@ pub(crate) fn parse_any_block(p: &mut MarkdownParser) {
             Ok(break_block)
         });
         if break_block.is_err() {
-            parse_paragraph(p);
+            let _ = parse_paragraph(p);
         }
+    } else {
+        let _ = parse_paragraph(p);
     }
 }
 
@@ -49,8 +55,41 @@ pub(crate) fn parse_indent_code_block(_p: &mut MarkdownParser) {
     todo!()
 }
 
-pub(crate) fn parse_paragraph(_p: &mut MarkdownParser) {
-    todo!()
+/// Parses a paragraph: one line unconditionally, then further lines until EOF, a blank
+/// line, or another block start. Always returns `Present`, even if nothing was consumed.
+pub(crate) fn parse_paragraph(p: &mut MarkdownParser) -> ParsedSyntax {
+    let paragraph = p.start();
+    parse_paragraph_line(p);
+
+    while !(p.at(T![EOF])
+        || is_blank_line(p)
+        || at_atx_heading(p)
+        || at_thematic_break_block(p)
+        || at_indent_code_block(p))
+    {
+        parse_paragraph_line(p);
+    }
+
+    Present(paragraph.complete(p, MD_PARAGRAPH))
+}
+
+/// Returns `true` at a bare `NEWLINE`, or at one `WHITESPACE` token directly followed by one.
+fn is_blank_line(p: &mut MarkdownParser) -> bool {
+    let whitespace_then_newline = p.at(WHITESPACE) && p.nth(1) == NEWLINE;
+    p.at(NEWLINE) || whitespace_then_newline
+}
+
+/// Bumps every token on the current line, then the `NEWLINE` that ends it, if any.
+/// EOF is never consumed, so the last line of the input may lack a trailing newline.
+pub(crate) fn parse_paragraph_line(p: &mut MarkdownParser) {
+    while !(p.at(T![EOF]) || p.at(NEWLINE)) {
+        let kind = p.source().current();
+        p.bump(kind);
+    }
+
+    if p.at(NEWLINE) {
+        p.bump(NEWLINE);
+    }
 }
 
 /// Attempt to parse some input with the given parsing function. If parsing

diff --git a/crates/biome_markdown_parser/src/syntax/atx_headings.rs b/crates/biome_markdown_parser/src/syntax/atx_headings.rs
@@ -0,0 +1,85 @@
+use crate::parser::MarkdownParser;
+use biome_markdown_syntax::MarkdownSyntaxKind::*;
+use biome_markdown_syntax::T;
+use biome_parser::{
+    prelude::ParsedSyntax::{self, *},
+    Parser,
+    prelude::TokenSource,
+};
+
+/// Reports whether the parser is at an ATX heading opener: 1 to 6 `#` tokens followed by
+/// `WHITESPACE`, `NEWLINE` or EOF. The scan stops after six hashes, so a seventh `#`
+/// shows up as the follower and the run is rejected. Only `at`/`nth` lookahead is used.
+pub(crate) fn at_atx_heading(p: &mut MarkdownParser) -> bool {
+    if !p.at(T![#]) {
+        return false;
+    }
+
+    // Measure the marker length by lookahead, capped at six.
+    let mut marker_len = 0;
+    while p.nth(marker_len) == T![#] && marker_len < 6 {
+        marker_len += 1;
+    }
+
+    matches!(p.nth(marker_len), WHITESPACE | NEWLINE | T![EOF])
+}
+
+/// Parses an ATX heading and completes it as `MD_HEADER1`..`MD_HEADER6`, picked from the
+/// number of opening `#` tokens.
+///
+/// Layout consumed: opening hashes, optional `WHITESPACE`, optional content wrapped in
+/// `MD_PARAGRAPH`, optional closing hashes, optional `WHITESPACE`, optional `NEWLINE`.
+/// Returns `Absent` without consuming anything when the parser is not at a `#`.
+pub(crate) fn parse_atx_heading(p: &mut MarkdownParser) -> ParsedSyntax {
+    if !p.at(T![#]) {
+        return Absent;
+    }
+
+    let m = p.start();
+
+    // Opening marker: at most six hashes, the same limit `at_atx_heading` applies.
+    let hash_list_m = p.start();
+    let mut hash_count = 0;
+    while p.at(T![#]) && hash_count < 6 {
+        p.bump(T![#]);
+        hash_count += 1;
+    }
+    hash_list_m.complete(p, MD_HASH_LIST);
+
+    // NOTE(review): this WHITESPACE (and the trailing WHITESPACE / NEWLINE below) becomes a
+    // direct child of the heading node - confirm the heading grammar accepts that.
+    if p.at(WHITESPACE) {
+        p.bump(WHITESPACE);
+    }
+
+    // Content is optional. A `#` run in the middle of the line (`# C# notes`) belongs to
+    // the content; only a run that ends the line is kept out as the closing sequence.
+    if !p.at(NEWLINE) && !p.at(T![EOF]) && !at_closing_hashes(p) {
+        let paragraph_m = p.start();
+        while !p.at(NEWLINE) && !p.at(T![EOF]) && !at_closing_hashes(p) {
+            p.bump(p.source().current());
+        }
+        paragraph_m.complete(p, MD_PARAGRAPH);
+    }
+
+    // Closing sequence (optional); the list node is completed even when it is empty.
+    let trailing_hash_list_m = p.start();
+    while p.at(T![#]) {
+        p.bump(T![#]);
+    }
+    trailing_hash_list_m.complete(p, MD_HASH_LIST);
+
+    if p.at(WHITESPACE) {
+        p.bump(WHITESPACE);
+    }
+    if p.at(NEWLINE) {
+        p.bump(NEWLINE);
+    }
+
+    let node = match hash_count {
+        1 => MD_HEADER1,
+        2 => MD_HEADER2,
+        3 => MD_HEADER3,
+        4 => MD_HEADER4,
+        5 => MD_HEADER5,
+        6 => MD_HEADER6,
+        // Not reachable: the guard above ensures one hash and the loop stops at six.
+        _ => MD_HEADER,
+    };
+
+    Present(m.complete(p, node))
+}
+
+/// Reports whether the `#` run at the cursor closes the heading: after the run, and after
+/// at most one `WHITESPACE` token, the line must end (`NEWLINE` or EOF). Lookahead only.
+fn at_closing_hashes(p: &mut MarkdownParser) -> bool {
+    if !p.at(T![#]) {
+        return false;
+    }
+
+    // Skip the remaining hashes of the run; the token at offset 0 is already known.
+    let mut offset = 1;
+    while p.nth(offset) == T![#] {
+        offset += 1;
+    }
+    if p.nth(offset) == WHITESPACE {
+        offset += 1;
+    }
+
+    matches!(p.nth(offset), NEWLINE | T![EOF])
+}
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/atx-heading.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/atx-heading.md
@@ -0,0 +1,6 @@
+# Heading 1
+## Heading 2
+### Heading 3
+#### Heading 4
+##### Heading 5
+###### Heading 6
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/empty.md b/crates/biome_markdown_parser/tests/md_test_suite/ok/empty.md
diff --git a/crates/biome_markdown_parser/tests/md_test_suite/ok/empty.md.snap b/crates/biome_markdown_parser/tests/md_test_suite/ok/empty.md.snap
@@ -0,0 +1,31 @@
+---
+source: crates/biome_markdown_parser/tests/spec_test.rs
+expression: snapshot
+snapshot_kind: text
+---
+## Input
+
+```
+
+```
+
+
+## AST
+
+```
+MdDocument {
+    bom_token: missing (optional),
+    value: MdBlockList [],
+    eof_token: EOF@0..0 "" [] [],
+}
+```
+
+## CST
+
+```
+0: MD_DOCUMENT@0..0
+  0: (empty)
+  1: MD_BLOCK_LIST@0..0
+  2: EOF@0..0 "" [] []
+
+```
diff --git a/crates/biome_markdown_syntax/src/generated/kind.rs b/crates/biome_markdown_syntax/src/generated/kind.rs
diff --git a/crates/biome_markdown_syntax/src/generated/macros.rs b/crates/biome_markdown_syntax/src/generated/macros.rs
diff --git a/crates/biome_markdown_syntax/src/generated/nodes.rs b/crates/biome_markdown_syntax/src/generated/nodes.rs
diff --git a/crates/biome_markdown_syntax/src/generated/nodes_mut.rs b/crates/biome_markdown_syntax/src/generated/nodes_mut.rs
diff --git a/xtask/codegen/markdown.ungram b/xtask/codegen/markdown.ungram
@@ -58,6 +58,13 @@ AnyContainerBlock = MdQuote | MdBulletListItem | MdOrderListItem
 // h1..h6
 MdHeader = before:MdHashList MdParagraph? after:MdHashList
 
+MdHeader1 = before:MdHashList MdParagraph? after:MdHashList
+MdHeader2 = before:MdHashList MdParagraph? after:MdHashList
+MdHeader3 = before:MdHashList MdParagraph? after:MdHashList
+MdHeader4 = before:MdHashList MdParagraph? after:MdHashList
+MdHeader5 = before:MdHashList MdParagraph? after:MdHashList
+MdHeader6 = before:MdHashList MdParagraph? after:MdHashList
+
 MdHashList = MdHash*
 
 MdHash = '#'

diff --git a/xtask/codegen/src/markdown_kinds_src.rs b/xtask/codegen/src/markdown_kinds_src.rs
@@ -41,6 +41,12 @@ pub const MARKDOWN_KINDS_SRC: KindsSrc = KindsSrc {
         "MD_HASH_LIST",
         "MD_HASH",
         "MD_HEADER",
+        "MD_HEADER1",
+        "MD_HEADER2",
+        "MD_HEADER3",
+        "MD_HEADER4",
+        "MD_HEADER5",
+        "MD_HEADER6",
         "MD_INDENT_CODE_BLOCK",
         "MD_FENCED_CODE_BLOCK",
         "MD_HTML_BLOCK",