From d8c06d7d1b69839987bc3cdd7bb8bb0d80e828d6 Mon Sep 17 00:00:00 2001 From: Ian Johnson Date: Fri, 16 Jun 2023 18:22:54 -0400 Subject: [PATCH] feat: parse doctype in `Scanner` Closes #9 --- src/Scanner.zig | 1116 ++++++++++++++++++++++++++++++++++++++++++++--- src/syntax.zig | 32 ++ 2 files changed, 1098 insertions(+), 50 deletions(-) diff --git a/src/Scanner.zig b/src/Scanner.zig index e004318..81feac6 100644 --- a/src/Scanner.zig +++ b/src/Scanner.zig @@ -38,6 +38,10 @@ state: State = .start, pos: usize = 0, /// The current element nesting depth. depth: usize = 0, +/// Whether we are inside the doctype. +in_doctype: bool = false, +/// Whether the doctype has been seen already (or it is known to be absent). +seen_doctype: bool = false, /// Whether the root element has been seen already. seen_root_element: bool = false, @@ -83,6 +87,24 @@ pub const Token = union(enum) { ok, /// XML declaration. xml_declaration: XmlDeclaration, + /// Doctype start. + doctype_start: DoctypeStart, + /// Parameter entity in doctype. + parameter_entity: ParameterEntity, + /// Element declaration in doctype. + element_declaration: ElementDeclaration, + /// Start of attribute list declaration in doctype. + attlist_declaration_start: AttlistDeclarationStart, + /// Definition in attribute list declaration in doctype. + attlist_declaration_definition: AttlistDeclarationDefinition, + /// General entity declaration in doctype. + general_entity_declaration: GeneralEntityDeclaration, + /// Parameter entity declaration in doctype. + parameter_entity_declaration: ParameterEntityDeclaration, + /// Notation declaration in doctype. + notation_declaration: NotationDeclaration, + /// Doctype end. + doctype_end, /// Element start tag. element_start: ElementStart, /// Element content. 
@@ -110,6 +132,90 @@ pub const Token = union(enum) { standalone: ?bool = null, }; + pub const DoctypeStart = struct { + root_name: Range, + public_id: ?Range = null, + system_id: ?Range = null, + }; + + pub const ParameterEntity = struct { + name: Range, + }; + + pub const ElementDeclaration = struct { + name: Range, + content_spec: ContentSpec, + + pub const ContentSpec = union(enum) { + empty, + any, + mixed: struct { options: Range }, + children: struct { definition: Range }, + }; + }; + + pub const AttlistDeclarationStart = struct { + element_name: Range, + }; + + pub const AttlistDeclarationDefinition = struct { + name: Range, + type: AttributeType, + default: Default, + + pub const AttributeType = union(enum) { + cdata, + id, + idref, + idrefs, + entity, + entities, + nmtoken, + nmtokens, + notation: struct { options: Range }, + enumeration: struct { options: Range }, + }; + + pub const Default = union(enum) { + required, + implied, + fixed: struct { value: Range }, + }; + }; + + pub const GeneralEntityDeclaration = struct { + name: Range, + value: Value, + + pub const Value = union(enum) { + internal: struct { value: Range }, + external: struct { + public_id: ?Range = null, + system_id: Range, + ndata: ?Range = null, + }, + }; + }; + + pub const ParameterEntityDeclaration = struct { + name: Range, + value: Value, + + pub const Value = union(enum) { + internal: struct { value: Range }, + external: struct { + public_id: ?Range = null, + system_id: Range, + }, + }; + }; + + pub const NotationDeclaration = struct { + name: Range, + public_id: ?Range = null, + system_id: ?Range = null, + }; + pub const ElementStart = struct { name: Range, }; @@ -169,6 +275,10 @@ pub const Token = union(enum) { /// "v", left is "ersion", so that when we handle the next character, we can /// fail parsing if it is not "e", and then set `left` to "rsion", and so on). 
pub const State = union(enum) { + // Note: due to the extremely large number of states in the state machine, + // they are organized roughly in the order one would expect to encounter + // them in a document, to make it slightly easier to follow. + /// Start of document. start, /// Start of document after BOM. @@ -224,6 +334,165 @@ pub const State = union(enum) { /// Start of document after XML declaration. start_after_xml_decl, + // Doctype parsing follows. + // Abandon hope all ye who enter here. + + /// A ' + /// After some part of ''. + doctype_element_decl_before_end, + + // + /// After some part of ' + /// After some part of ' + /// After some part of ' if (c == 0xFEFF or syntax.isSpace(c)) { self.state = .start_after_bom; @@ -379,13 +656,15 @@ fn nextNoAdvance(self: *Scanner, c: u21, len: usize) error{SyntaxError}!Token { }, .unknown_document_start => if (syntax.isNameStartChar(c)) { + if (self.depth == 0) { + self.seen_doctype = true; + } self.state = .{ .element_start_name = .{ .start = self.pos } }; return .ok; } else if (c == '?') { self.state = .{ .pi_or_xml_decl_start = .{ .start = self.pos + len } }; return .ok; } else if (c == '!') { - // TODO: doctype self.state = .unknown_start_bang; return .ok; } else { @@ -632,118 +911,855 @@ fn nextNoAdvance(self: *Scanner, c: u21, len: usize) error{SyntaxError}!Token { return error.SyntaxError; }, - .unknown_start => if (syntax.isNameStartChar(c) and !self.seen_root_element) { - self.state = .{ .element_start_name = .{ .start = self.pos } }; + .doctype_start => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_after_start; + } else { + self.state = .{ .doctype_start = .{ .left = state.left[1..] 
} }; + } return .ok; - } else if (c == '/' and self.depth > 0) { - self.state = .element_end; + } else { + return error.SyntaxError; + }, + + .doctype_after_start => if (syntax.isSpace(c)) { return .ok; - } else if (c == '!') { - self.state = .unknown_start_bang; + } else if (syntax.isNameStartChar(c)) { + self.state = .{ .doctype_root_name = .{ .start = self.pos } }; return .ok; - } else if (c == '?') { - self.state = .pi; + } else { + return error.SyntaxError; + }, + + .doctype_root_name => |state| if (syntax.isNameChar(c)) { return .ok; + } else if (syntax.isSpace(c)) { + // Whitespace ends the root name; record its range and keep scanning. + self.state = .{ .doctype_after_root_name = .{ .root_name = .{ .start = state.start, .end = self.pos } } }; + return .ok; } else { return error.SyntaxError; }, - .unknown_start_bang => if (c == '-') { - self.state = .comment_before_start; + .doctype_after_root_name => |state| if (syntax.isSpace(c)) { return .ok; - } else if (self.depth > 0 and c == '[') { - // Textual content is not allowed outside the root element. - self.state = .{ .cdata_before_start = .{ .left = "CDATA[" } }; + } else if (c == 'P') { + self.state = .{ .doctype_public_start = .{ .root_name = state.root_name, .left = "UBLIC " } }; return .ok; + } else if (c == 'S') { + self.state = .{ .doctype_system_start = .{ .root_name = state.root_name, .left = "YSTEM " } }; + return .ok; + } else if (c == '[') { + self.state = .doctype_internal_subset; + return .{ .doctype_start = .{ .root_name = state.root_name } }; } else { return error.SyntaxError; }, - .comment_before_start => if (c == '-') { - self.state = .{ .comment = .{ .start = self.pos + len } }; - return .comment_start; + .doctype_public_start => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_before_public_id = .{ .root_name = state.root_name } }; + } else { + // Carry root_name along while the rest of the keyword is matched. + self.state = .{ .doctype_public_start = .{ .root_name = state.root_name, .left = state.left[1..] 
} }; + } + return .ok; } else { return error.SyntaxError; }, - .comment => |state| if (c == '-') { - self.state = .{ .comment_maybe_before_end = .{ .start = state.start, .end = self.pos } }; + .doctype_system_start => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_before_system_id = .{ .root_name = state.root_name, .public_id = null } }; + } else { + // Carry root_name along while the rest of the keyword is matched. + self.state = .{ .doctype_system_start = .{ .root_name = state.root_name, .left = state.left[1..] } }; + } return .ok; - } else if (syntax.isChar(c)) { + } else { + return error.SyntaxError; + }, + + .doctype_before_public_id => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == '"' or c == '\'') { + self.state = .{ .doctype_public_id = .{ .root_name = state.root_name, .start = self.pos + len, .quote = @intCast(u8, c) } }; return .ok; } else { return error.SyntaxError; }, - .comment_maybe_before_end => |state| if (c == '-') { - self.state = .comment_before_end; - return .{ .comment_content = .{ .content = .{ .start = state.start, .end = state.end }, .final = true } }; + .doctype_public_id => |state| if (c == state.quote) { + self.state = .{ .doctype_before_system_id = .{ .root_name = state.root_name, .public_id = .{ .start = state.start, .end = self.pos } } }; + return .ok; + } else if (syntax.isPubidChar(c)) { + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_before_system_id => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == '"' or c == '\'') { + self.state = .{ .doctype_system_id = .{ .root_name = state.root_name, .public_id = state.public_id, .start = self.pos + len, .quote = @intCast(u8, c) } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_system_id => |state| if (c == state.quote) { + self.state = .doctype_after_external_id; + // The system ID spans from just after the opening quote up to the closing quote. + return .{ .doctype_start = .{ .root_name = state.root_name, .public_id = state.public_id, .system_id = .{ .start = state.start, .end = self.pos } } }; } else if (syntax.isChar(c)) { - self.state = .{ .comment = .{ .start = 
state.start } }; return .ok; } else { return error.SyntaxError; }, - .comment_before_end => if (c == '>') { - self.state = .{ .content = .{ .start = self.pos + len } }; + .doctype_after_external_id => if (syntax.isSpace(c)) { return .ok; + } else if (c == '[') { + // doctype_start was already returned when the system ID's closing quote was seen. + self.state = .doctype_internal_subset; + return .ok; + } else if (c == '>') { + self.in_doctype = false; + self.seen_doctype = true; + self.state = .start_after_xml_decl; + return .doctype_end; } else { return error.SyntaxError; }, - .pi => if (syntax.isNameStartChar(c)) { - self.state = .{ .pi_target = .{ .start = self.pos, .xml_seen = (TokenMatcher("xml"){}).accept(c) } }; + .doctype_internal_subset => if (syntax.isSpace(c)) { + return .ok; + } else if (c == '%') { + self.state = .doctype_pe_ref_start; + return .ok; + } else if (c == '<') { + self.state = .doctype_unknown_start; return .ok; + } else if (c == ']') { + self.state = .doctype_after_internal_subset; + return .doctype_end; } else { return error.SyntaxError; }, - .pi_target => |state| if (syntax.isNameChar(c)) { - self.state = .{ .pi_target = .{ .start = state.start, .xml_seen = state.xml_seen.accept(c) } }; + .doctype_pe_ref_start => if (syntax.isNameStartChar(c)) { + self.state = .{ .doctype_pe_ref_name = .{ .start = self.pos } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_pe_ref_name => |state| if (syntax.isNameChar(c)) { + return .ok; + } else if (c == ';') { + self.state = .doctype_internal_subset; + // The token payload wraps the range in its `name` field. + return .{ .parameter_entity = .{ .name = .{ .start = state.start, .end = self.pos } } }; + } else { + return error.SyntaxError; + }, + + .doctype_unknown_start => if (c == '!') { + self.state = .doctype_unknown_start_bang; + return .ok; + } else if (c == '?') { + self.state = .pi; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_unknown_start_bang => if (c == '-') { + self.state = .comment_before_start; + return .ok; + } else if (c == 'E') { + self.state = .doctype_unknown_start_e; + return .ok; + } else if (c == 'A') { 
+ self.state = .{ .doctype_attlist_decl_start = .{ .left = "TTLIST " } }; + return .ok; + } else if (c == 'N') { + self.state = .{ .doctype_notation_decl_start = .{ .left = "OTATION " } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_unknown_start_e => if (c == 'L') { + self.state = .{ .doctype_element_decl_start = .{ .left = "EMENT " } }; + return .ok; + } else if (c == 'N') { + self.state = .{ .doctype_entity_decl_start = .{ .left = "TITY " } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_element_decl_start => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_element_decl_after_start; + } else { + self.state = .{ .doctype_element_decl_start = .{ .left = state.left[1..] } }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_element_decl_after_start => if (syntax.isSpace(c)) { + return .ok; + } else if (syntax.isNameStartChar(c)) { + self.state = .{ .doctype_element_decl_name = .{ .start = self.pos } }; + return .ok; + } else { + return error.SyntaxError; + }, + + // Named to match the state entered from doctype_element_decl_after_start. + .doctype_element_decl_name => |state| if (syntax.isNameChar(c)) { return .ok; } else if (syntax.isSpace(c)) { - if (state.xml_seen.matches()) { - // PI named 'xml' is not allowed - return error.SyntaxError; + self.state = .{ .doctype_element_decl_after_name = .{ .name = .{ .start = state.start, .end = self.pos } } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_element_decl_after_name => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == 'E') { + self.state = .{ .doctype_element_decl_empty = .{ .name = state.name, .left = "MPTY" } }; + return .ok; + } else if (c == 'A') { + self.state = .{ .doctype_element_decl_any = .{ .name = state.name, .left = "NY" } }; + return .ok; + } else if (c == '(') { + self.state = .{ .doctype_element_decl_after_paren = .{ .name = state.name } }; + return .ok; + } else { + return error.SyntaxError; + }, + + 
.doctype_element_decl_empty => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_element_decl_before_end; + return .{ .element_declaration = .{ .name = state.name, .content_spec = .empty } }; } else { - self.state = .pi_after_target; - return .{ .pi_start = .{ .target = .{ .start = state.start, .end = self.pos } } }; + self.state = .{ .doctype_element_decl_empty = .{ .name = state.name, .left = state.left[1..] } }; + return .ok; } - } else if (c == '?') { - if (state.xml_seen.matches()) { - return error.SyntaxError; + } else { + return error.SyntaxError; + }, + + .doctype_element_decl_any => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_element_decl_before_end; + return .{ .element_declaration = .{ .name = state.name, .content_spec = .any } }; } else { - self.state = .{ .pi_maybe_end = .{ .start = self.pos, .end = self.pos } }; - return .{ .pi_start = .{ .target = .{ .start = state.start, .end = self.pos } } }; + self.state = .{ .doctype_element_decl_any = .{ .name = state.name, .left = state.left[1..] 
} }; + return .ok; } } else { return error.SyntaxError; }, - .pi_after_target => if (syntax.isSpace(c)) { + .doctype_element_decl_after_paren => |state| if (syntax.isSpace(c)) { return .ok; - } else if (syntax.isChar(c)) { - self.state = .{ .pi_content = .{ .start = self.pos } }; + } else if (c == '#') { + self.state = .{ .doctype_element_decl_pcdata = .{ .name = state.name, .left = "PCDATA" } }; return .ok; - } else if (c == '?') { - self.state = .{ .pi_maybe_end = .{ .start = self.pos, .end = self.pos } }; + } else if (c == '(') { + // TODO: children spec parsing goes here + return error.SyntaxError; + } else { + return error.SyntaxError; + }, + + .doctype_element_decl_pcdata => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_element_decl_mixed = .{ .name = state.name, .start = self.pos + len } }; + } else { + // Stay in this state until the whole keyword has been matched. + self.state = .{ .doctype_element_decl_pcdata = .{ .name = state.name, .left = state.left[1..] } }; + } return .ok; } else { return error.SyntaxError; }, - .pi_content => |state| if (c == '?') { - self.state = .{ .pi_maybe_end = .{ .start = state.start, .end = self.pos } }; + .doctype_element_decl_mixed => |state| if (syntax.isSpace(c)) { return .ok; - } else if (syntax.isChar(c)) { + } else if (c == '|') { + self.state = .{ .doctype_element_decl_mixed_before_name = .{ .name = state.name, .start = state.start } }; return .ok; + } else if (c == ')') { + self.state = .doctype_element_decl_before_end; + return .{ .element_declaration = .{ .name = state.name, .content_spec = .{ .mixed = .{ .options = .{ .start = state.start, .end = self.pos } } } } }; } else { return error.SyntaxError; }, - .pi_maybe_end => |state| if (c == '>') { - self.state = .{ .content = .{ .start = self.pos + len } }; + .doctype_element_decl_mixed_before_name => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (syntax.isNameStartChar(c)) { + self.state = .{ .doctype_element_decl_mixed_name = .{ .name = state.name, .start = state.start } }; + 
return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_element_decl_mixed_name => |state| if (syntax.isNameChar(c)) { + return .ok; + } else if (syntax.isSpace(c)) { + self.state = .{ .doctype_element_decl_mixed = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == '|') { + self.state = .{ .doctype_element_decl_mixed_before_name = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == ')') { + self.state = .doctype_element_decl_before_end; + return .{ .element_declaration = .{ .name = state.name, .content_spec = .{ .mixed = .{ .options = .{ .start = state.start, .end = self.pos } } } } }; + } else { + // Any other character cannot continue or terminate a name in the mixed content list. + return error.SyntaxError; + }, + + .doctype_element_decl_before_end => if (syntax.isSpace(c)) { + return .ok; + } else if (c == '>') { + self.state = .doctype_internal_subset; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_start => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_attlist_decl_after_start; + } else { + self.state = .{ .doctype_attlist_decl_start = .{ .left = state.left[1..] 
} }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_after_start => if (syntax.isSpace(c)) { + return .ok; + } else if (syntax.isNameStartChar(c)) { + self.state = .{ .doctype_attlist_decl_name = .{ .start = self.pos } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_name => |state| if (syntax.isNameChar(c)) { + return .ok; + } else if (syntax.isSpace(c)) { + self.state = .doctype_attlist_decl_def; + return .{ .attlist_declaration_start = .{ .element_name = .{ .start = state.start, .end = self.pos } } }; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def => if (syntax.isSpace(c)) { + return .ok; + } else if (syntax.isNameStartChar(c)) { + self.state = .{ .doctype_attlist_decl_def_name = .{ .start = self.pos } }; + return .ok; + } else if (c == '>') { + self.state = .doctype_internal_subset; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_name => |state| if (syntax.isNameChar(c)) { + return .ok; + } else if (syntax.isSpace(c)) { + self.state = .{ .doctype_attlist_decl_def_after_name = .{ .name = .{ .start = state.start, .end = self.pos } } }; + return .ok; + } else { + return error.SyntaxError; + }, + + // The first character of the attribute type selects which keyword (or option list) to match. + .doctype_attlist_decl_def_after_name => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == 'C') { + self.state = .{ .doctype_attlist_decl_def_cdata = .{ .name = state.name, .left = "DATA" } }; + return .ok; + } else if (c == 'I') { + self.state = .{ .doctype_attlist_decl_def_id = .{ .name = state.name, .left = "D" } }; + return .ok; + } else if (c == 'E') { + self.state = .{ .doctype_attlist_decl_def_entit = .{ .name = state.name, .left = "NTIT" } }; + return .ok; + } else if (c == 'N') { + self.state = .{ .doctype_attlist_decl_def_after_n = .{ .name = state.name } }; + return .ok; + } else if (c == '(') { + self.state = .{ .doctype_attlist_decl_def_enumeration_before_option = .{ .name = state.name, .start = self.pos + 
len } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_cdata => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .cdata } }; + } else { + self.state = .{ .doctype_attlist_decl_def_cdata = .{ .name = state.name, .left = state.left[1..] } }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_id => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_id = .{ .name = state.name } }; + } else { + self.state = .{ .doctype_attlist_decl_def_id = .{ .name = state.name, .left = state.left[1..] } }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + // "ID" may be the whole type or the prefix of "IDREF"/"IDREFS". + .doctype_attlist_decl_def_after_id => |state| if (syntax.isSpace(c)) { + self.state = .{ .doctype_attlist_decl_def_before_default = .{ .name = state.name, .type = .id } }; + return .ok; + } else if (c == 'R') { + self.state = .{ .doctype_attlist_decl_def_idref = .{ .name = state.name, .left = "EF" } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_idref => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_idref = .{ .name = state.name } }; + } else { + self.state = .{ .doctype_attlist_decl_def_idref = .{ .name = state.name, .left = state.left[1..] 
} }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + // "IDREF" may be the whole type or the prefix of "IDREFS". + .doctype_attlist_decl_def_after_idref => |state| if (syntax.isSpace(c)) { + self.state = .{ .doctype_attlist_decl_def_before_default = .{ .name = state.name, .type = .idref } }; + return .ok; + } else if (c == 'S') { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .idrefs } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_entit => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_entit = .{ .name = state.name } }; + } else { + self.state = .{ .doctype_attlist_decl_def_entit = .{ .name = state.name, .left = state.left[1..] } }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_after_entit => |state| if (c == 'Y') { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .entity } }; + return .ok; + } else if (c == 'I') { + self.state = .{ .doctype_attlist_decl_def_entities = .{ .name = state.name, .left = "ES" } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_entities => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .entities } }; + } else { + self.state = .{ .doctype_attlist_decl_def_entities = .{ .name = state.name, .left = state.left[1..] 
} }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_after_n => |state| if (c == 'M') { + self.state = .{ .doctype_attlist_decl_def_nmtoken = .{ .name = state.name, .left = "TOKEN" } }; + return .ok; + } else if (c == 'O') { + self.state = .{ .doctype_attlist_decl_def_notation = .{ .name = state.name, .left = "TATION " } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_nmtoken => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_nmtoken = .{ .name = state.name } }; + } else { + self.state = .{ .doctype_attlist_decl_def_nmtoken = .{ .name = state.name, .left = state.left[1..] } }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_after_nmtoken => |state| if (syntax.isSpace(c)) { + self.state = .{ .doctype_attlist_decl_def_before_default = .{ .name = state.name, .type = .nmtoken } }; + return .ok; + } else if (c == 'S') { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .nmtokens } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_notation => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_notation = .{ .name = state.name } }; + } else { + self.state = .{ .doctype_attlist_decl_def_notation = .{ .name = state.name, .left = state.left[1..] 
} }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_after_notation => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == '(') { + self.state = .{ .doctype_attlist_decl_def_notation_before_option = .{ .name = state.name, .start = self.pos + len } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_notation_before_option => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (syntax.isNameStartChar(c)) { + self.state = .{ .doctype_attlist_decl_def_notation_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_notation_option => |state| if (syntax.isNameChar(c)) { + return .ok; + } else if (syntax.isSpace(c)) { + self.state = .{ .doctype_attlist_decl_def_notation_after_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == '|') { + self.state = .{ .doctype_attlist_decl_def_notation_before_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == ')') { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .{ .notation = .{ .options = .{ .start = state.start, .end = self.pos } } } } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_notation_after_option => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == '|') { + self.state = .{ .doctype_attlist_decl_def_notation_before_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == ')') { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .{ .notation = .{ .options = .{ .start = state.start, .end = self.pos } } } } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_enumeration_before_option => |state| if (syntax.isSpace(c)) { + return .ok; + } else if 
(syntax.isNameChar(c)) { + // Enumeration options are Nmtokens, which may begin with any name character. + self.state = .{ .doctype_attlist_decl_def_enumeration_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_enumeration_option => |state| if (syntax.isNameChar(c)) { + return .ok; + } else if (syntax.isSpace(c)) { + self.state = .{ .doctype_attlist_decl_def_enumeration_after_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == '|') { + self.state = .{ .doctype_attlist_decl_def_enumeration_before_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == ')') { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .{ .enumeration = .{ .options = .{ .start = state.start, .end = self.pos } } } } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_enumeration_after_option => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == '|') { + self.state = .{ .doctype_attlist_decl_def_enumeration_before_option = .{ .name = state.name, .start = state.start } }; + return .ok; + } else if (c == ')') { + self.state = .{ .doctype_attlist_decl_def_after_type = .{ .name = state.name, .type = .{ .enumeration = .{ .options = .{ .start = state.start, .end = self.pos } } } } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_after_type => |state| if (syntax.isSpace(c)) { + self.state = .{ .doctype_attlist_decl_def_before_default = .{ .name = state.name, .type = state.type } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_before_default => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == '#') { + self.state = .{ .doctype_attlist_decl_def_default = .{ .name = state.name, .type = state.type } }; + return .ok; + } else if (c == '"' or c == '\'') { + // A default value may be given directly, without a '#' keyword. It is scanned by the + // fixed-value state and therefore reported with the `fixed` default tag. + self.state = .{ .doctype_attlist_decl_def_fixed_value = .{ .name = state.name, .type = state.type, .start = self.pos + len, .quote = @intCast(u8, c) } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_default => |state| if (c == 'R') { + 
self.state = .{ .doctype_attlist_decl_def_required = .{ .name = state.name, .type = state.type, .left = "EQUIRED" } }; + return .ok; + } else if (c == 'I') { + self.state = .{ .doctype_attlist_decl_def_implied = .{ .name = state.name, .type = state.type, .left = "MPLIED" } }; + return .ok; + } else if (c == 'F') { + self.state = .{ .doctype_attlist_decl_def_fixed = .{ .name = state.name, .type = state.type, .left = "IXED " } }; + return .ok; + } else { + // Only a keyword may follow '#'; a quoted value directly after '#' is not valid. + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_required => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_attlist_decl_after_def; + return .{ .attlist_declaration_definition = .{ .name = state.name, .type = state.type, .default = .required } }; + } else { + self.state = .{ .doctype_attlist_decl_def_required = .{ .name = state.name, .type = state.type, .left = state.left[1..] } }; + return .ok; + } + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_implied => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_attlist_decl_after_def; + return .{ .attlist_declaration_definition = .{ .name = state.name, .type = state.type, .default = .implied } }; + } else { + self.state = .{ .doctype_attlist_decl_def_implied = .{ .name = state.name, .type = state.type, .left = state.left[1..] } }; + return .ok; + } + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_fixed => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .{ .doctype_attlist_decl_def_after_fixed = .{ .name = state.name, .type = state.type } }; + } else { + self.state = .{ .doctype_attlist_decl_def_fixed = .{ .name = state.name, .type = state.type, .left = state.left[1..] 
} }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_after_fixed => |state| if (syntax.isSpace(c)) { + return .ok; + } else if (c == '"' or c == '\'') { + self.state = .{ .doctype_attlist_decl_def_fixed_value = .{ .name = state.name, .type = state.type, .start = self.pos + len, .quote = @intCast(u8, c) } }; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_def_fixed_value => |state| if (c == state.quote) { + self.state = .doctype_attlist_decl_after_def; + return .{ .attlist_declaration_definition = .{ .name = state.name, .type = state.type, .default = .{ .fixed = .{ .value = .{ .start = state.start, .end = self.pos } } } } }; + } else if (syntax.isChar(c)) { + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_attlist_decl_after_def => if (syntax.isSpace(c)) { + self.state = .doctype_attlist_decl_def; + return .ok; + } else if (c == '>') { + self.state = .doctype_internal_subset; + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_entity_decl_start => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_entity_decl_after_start; + } else { + self.state = .{ .doctype_entity_decl_start = .{ .left = state.left[1..] } }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_notation_decl_start => |state| if (c == state.left[0]) { + if (state.left.len == 1) { + self.state = .doctype_notation_decl_after_start; + } else { + self.state = .{ .doctype_notation_decl_start = .{ .left = state.left[1..] 
} }; + } + return .ok; + } else { + return error.SyntaxError; + }, + + .doctype_after_internal_subset => if (syntax.isSpace(c)) { + return .ok; + } else if (c == '>') { + self.in_doctype = false; + self.seen_doctype = true; + self.state = .start_after_xml_decl; + return .ok; + } else { + return error.SyntaxError; + }, + + .unknown_start => if (syntax.isNameStartChar(c) and !self.seen_root_element) { + if (self.depth == 0) { + self.seen_doctype = true; + } + self.state = .{ .element_start_name = .{ .start = self.pos } }; + return .ok; + } else if (c == '/' and self.depth > 0) { + self.state = .element_end; + return .ok; + } else if (c == '!') { + self.state = .unknown_start_bang; + return .ok; + } else if (c == '?') { + self.state = .pi; + return .ok; + } else { + return error.SyntaxError; + }, + + .unknown_start_bang => if (c == '-') { + self.state = .comment_before_start; + return .ok; + } else if (self.depth > 0 and c == '[') { + // Textual content is not allowed outside the root element. 
+            self.state = .{ .cdata_before_start = .{ .left = "CDATA[" } }; // the characters still expected after '[' are stored in `left`
+            return .ok;
+        } else if (!self.seen_doctype and c == 'D') { // taken only while seen_doctype is still false
+            self.in_doctype = true; // makes comment and PI endings return to .doctype_internal_subset
+            self.state = .{ .doctype_start = .{ .left = "OCTYPE " } }; // 'D' is consumed here; the rest, including the trailing space, is still expected
+            return .ok;
+        } else {
+            return error.SyntaxError;
+        },
+
+        .comment_before_start => if (c == '-') {
+            self.state = .{ .comment = .{ .start = self.pos + len } }; // the content range starts at self.pos + len
+            return .comment_start;
+        } else {
+            return error.SyntaxError; // only '-' is accepted in this state
+        },
+
+        .comment => |state| if (c == '-') {
+            self.state = .{ .comment_maybe_before_end = .{ .start = state.start, .end = self.pos } }; // remember self.pos as the tentative end of the content
+            return .ok;
+        } else if (syntax.isChar(c)) {
+            return .ok; // ordinary comment character; state unchanged
+        } else {
+            return error.SyntaxError;
+        },
+
+        .comment_maybe_before_end => |state| if (c == '-') { // second consecutive '-'
+            self.state = .comment_before_end;
+            return .{ .comment_content = .{ .content = .{ .start = state.start, .end = state.end }, .final = true } }; // content ends at the position recorded for the first '-'
+        } else if (syntax.isChar(c)) {
+            self.state = .{ .comment = .{ .start = state.start } }; // a lone '-' stays part of the content: resume with the original start
+            return .ok;
+        } else {
+            return error.SyntaxError;
+        },
+
+        .comment_before_end => if (c == '>') { // after two '-' only '>' is accepted
+            if (self.in_doctype) {
+                self.state = .doctype_internal_subset; // comment was inside the doctype: go back to the internal subset state
+            } else {
+                self.state = .{ .content = .{ .start = self.pos + len } }; // otherwise content resumes at self.pos + len
+            }
+            return .ok;
+        } else {
+            return error.SyntaxError;
+        },
+
+        .pi => if (syntax.isNameStartChar(c)) {
+            self.state = .{ .pi_target = .{ .start = self.pos, .xml_seen = (TokenMatcher("xml"){}).accept(c) } }; // target range starts at self.pos; xml_seen is fed every target character (TokenMatcher is defined outside this view)
+            return .ok;
+        } else {
+            return error.SyntaxError;
+        },
+
+        .pi_target => |state| if (syntax.isNameChar(c)) {
+            self.state = .{ .pi_target = .{ .start = state.start, .xml_seen = state.xml_seen.accept(c) } }; // keep the start, feed the matcher the next character
+            return .ok;
+        } else if (syntax.isSpace(c)) { // whitespace ends the target
+            if (state.xml_seen.matches()) {
+                // PI named 'xml' is not allowed
+                return error.SyntaxError;
+            } else {
+                self.state = .pi_after_target;
+                return .{ .pi_start = .{ .target = .{ .start = state.start, .end = self.pos } } }; // target range runs from state.start to self.pos
+            }
+        } else if (c == '?') { // '?' directly after the target
+            if (state.xml_seen.matches()) {
+                return
error.SyntaxError;
+            } else {
+                self.state = .{ .pi_maybe_end = .{ .start = self.pos, .end = self.pos } }; // start == end: the content range is empty so far
+                return .{ .pi_start = .{ .target = .{ .start = state.start, .end = self.pos } } };
+            }
+        } else {
+            return error.SyntaxError;
+        },
+
+        .pi_after_target => if (syntax.isSpace(c)) {
+            return .ok; // whitespace: consume it and stay in this state
+        } else if (c == '?') { // tested before isChar: '?' is an XML Char, so (assuming syntax.isChar follows that production) the other order makes this branch unreachable and "?>" right after the whitespace never ends the PI
+            self.state = .{ .pi_maybe_end = .{ .start = self.pos, .end = self.pos } }; // start == end: the content range is empty so far
+            return .ok;
+        } else if (syntax.isChar(c)) {
+            self.state = .{ .pi_content = .{ .start = self.pos } }; // first content character; the content range starts at self.pos
+            return .ok;
+        } else {
+            return error.SyntaxError;
+        },
+
+        .pi_content => |state| if (c == '?') {
+            self.state = .{ .pi_maybe_end = .{ .start = state.start, .end = self.pos } }; // remember self.pos as the tentative end of the content
+            return .ok;
+        } else if (syntax.isChar(c)) {
+            return .ok; // ordinary content character; state unchanged
+        } else {
+            return error.SyntaxError;
+        },
+
+        .pi_maybe_end => |state| if (c == '>') {
+            if (self.in_doctype) {
+                self.state = .doctype_internal_subset; // PI was inside the doctype: go back to the internal subset state
+            } else {
+                self.state = .{ .content = .{ .start = self.pos + len } }; // otherwise content resumes at self.pos + len
+            }
             return .{ .pi_content = .{ .content = .{ .start = state.start, .end = state.end }, .final = true } };
         } else if (syntax.isChar(c)) {
             self.state = .{ .pi_content = .{ .start = state.start } };
diff --git a/src/syntax.zig b/src/syntax.zig
index d55f9ea..33fb58f 100644
--- a/src/syntax.zig
+++ b/src/syntax.zig
@@ -104,3 +104,35 @@ pub inline fn isEncodingChar(c: u21) bool {
         else => false,
     };
 }
+
+pub inline fn isPubidChar(c: u21) bool { // true exactly for the characters listed below; the list matches the XML 1.0 PubidChar production
+    return switch (c) {
+        ' ',
+        '\r',
+        '\n',
+        'a'...'z',
+        'A'...'Z',
+        '0'...'9',
+        '-',
+        '\'',
+        '(',
+        ')',
+        '+',
+        ',',
+        '.',
+        '/',
+        ':',
+        '=',
+        '?',
+        ';',
+        '!',
+        '*',
+        '#',
+        '@',
+        '$',
+        '_',
+        '%',
+        => true,
+        else => false,
+    };
+}