diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseString.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseString.ts new file mode 100644 index 0000000000..72557334b9 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseString.ts @@ -0,0 +1,43 @@ +import { TrimEndRule, TrimStartRule } from './types'; + +export const collapseString = ( + text: string, + { + trimStart = 'collapse', + trimEnd = 'collapse', + shouldCollapseWhiteSpace = true, + whiteSpaceIncludesNewlines = true, + }: { + trimStart?: TrimStartRule; + trimEnd?: TrimEndRule; + shouldCollapseWhiteSpace?: boolean; + whiteSpaceIncludesNewlines?: boolean; + } = {} +) => { + if (trimStart === 'all') { + text = text.replace(/^\s+/, ''); + } + + if (trimEnd === 'single-newline') { + // Strip at most one newline from the end + text = text.replace(/\n$/, ''); + } + + if (shouldCollapseWhiteSpace) { + if (whiteSpaceIncludesNewlines) { + text = text.replaceAll(/\s+/g, ' '); + } else { + // Collapse horizontal whitespace + text = text.replaceAll(/[^\S\n\r]+/g, ' '); + + /** + * Trim horizontal whitespace from the start and end of lines (behavior + * of pre-line). 
+ */ + text = text.replaceAll(/^[^\S\n\r]+/gm, ''); + text = text.replaceAll(/[^\S\n\r]+$/gm, ''); + } + } + + return text; +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.spec.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.spec.ts similarity index 99% rename from packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.spec.ts rename to packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.spec.ts index 592a1396e7..af564eeb1a 100644 --- a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.spec.ts +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.spec.ts @@ -1,5 +1,5 @@ +import { htmlStringToDOMNode } from '../htmlStringToDOMNode'; import { collapseWhiteSpace } from './collapseWhiteSpace'; -import { htmlStringToDOMNode } from './htmlStringToDOMNode'; const expectCollapsedWhiteSpace = (input: string, expected: string) => { const element = htmlStringToDOMNode(input); diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.ts new file mode 100644 index 0000000000..eed683d972 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.ts @@ -0,0 +1,17 @@ +import { collapseWhiteSpaceElement } from './collapseWhiteSpaceElement'; +import { CollapseWhiteSpaceState } from './types'; + +// Entrypoint +export const collapseWhiteSpace = (element: HTMLElement) => { + const clonedElement = element.cloneNode(true) as HTMLElement; + + // Mutable state object + const state: CollapseWhiteSpaceState = { + inlineFormattingContext: null, + whiteSpaceRule: 'normal', + }; + + collapseWhiteSpaceElement(clonedElement, state); + + return clonedElement; +}; diff --git 
a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceChildren.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceChildren.ts new file mode 100644 index 0000000000..9ecd62c4ad --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceChildren.ts @@ -0,0 +1,13 @@ +import { collapseWhiteSpaceNode } from './collapseWhiteSpaceNode'; +import { CollapseWhiteSpaceState } from './types'; + +export const collapseWhiteSpaceChildren = ( + node: Node, + state: CollapseWhiteSpaceState +) => { + const childNodes = Array.from(node.childNodes); + + for (const childNode of childNodes) { + collapseWhiteSpaceNode(childNode, state); + } +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceElement.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceElement.ts new file mode 100644 index 0000000000..be3ac6ca72 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceElement.ts @@ -0,0 +1,38 @@ +import { collapseWhiteSpaceChildren } from './collapseWhiteSpaceChildren'; +import { inferWhiteSpaceRule } from './inferWhiteSpaceRule'; +import { isHtmlInlineElement } from './isHtmlInlineElement'; +import { endInlineFormattingContext } from './stateTransforms'; +import { CollapseWhiteSpaceState } from './types'; + +export const collapseWhiteSpaceElement = ( + element: HTMLElement, + state: CollapseWhiteSpaceState +) => { + const isInlineElement = isHtmlInlineElement(element); + const previousWhiteSpaceRule = state.whiteSpaceRule; + const inferredWhiteSpaceRule = inferWhiteSpaceRule(element); + + if (inferredWhiteSpaceRule) { + state.whiteSpaceRule = inferredWhiteSpaceRule; + } + + /** + * Note: We do not want to start an inline formatting context until we + * encounter a text node. 
+ */ + + // End any existing inline formatting context + if (!isInlineElement) { + endInlineFormattingContext(state); + } + + collapseWhiteSpaceChildren(element, state); + + // Do not let inline formatting context break out of block elements + if (!isInlineElement) { + endInlineFormattingContext(state); + } + + // Restore previous whiteSpaceRule + state.whiteSpaceRule = previousWhiteSpaceRule; +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceNode.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceNode.ts new file mode 100644 index 0000000000..3b6ce09e32 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceNode.ts @@ -0,0 +1,23 @@ +import { isHtmlElement } from '../isHtmlElement'; +import { isHtmlText } from '../isHtmlText'; +import { collapseWhiteSpaceChildren } from './collapseWhiteSpaceChildren'; +import { collapseWhiteSpaceElement } from './collapseWhiteSpaceElement'; +import { collapseWhiteSpaceText } from './collapseWhiteSpaceText'; +import { CollapseWhiteSpaceState } from './types'; + +export const collapseWhiteSpaceNode = ( + node: Node, + state: CollapseWhiteSpaceState +) => { + if (isHtmlElement(node)) { + collapseWhiteSpaceElement(node as HTMLElement, state); + return; + } + + if (isHtmlText(node)) { + collapseWhiteSpaceText(node as Text, state); + return; + } + + collapseWhiteSpaceChildren(node, state); +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceText.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceText.ts new file mode 100644 index 0000000000..2288681fe9 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceText.ts @@ -0,0 +1,69 @@ +import { collapseString } from './collapseString'; +import { isLastNonEmptyTextOfInlineFormattingBlock } from 
'./isLastNonEmptyTextOfInlineFormattingBlock'; +import { upsertInlineFormattingContext } from './stateTransforms'; +import { CollapseWhiteSpaceState, TrimEndRule, TrimStartRule } from './types'; + +export const collapseWhiteSpaceText = ( + text: Text, + state: CollapseWhiteSpaceState +) => { + const textContent = text.textContent || ''; + const isWhiteSpaceOnly = textContent.trim() === ''; + + // Do not start an inline formatting context with a whiteSpace-only text node + if (state.inlineFormattingContext || !isWhiteSpaceOnly) { + upsertInlineFormattingContext(state); + } + + const { whiteSpaceRule } = state; + + /** + * Note: Due to the way HTML strings are parsed in htmlStringToDOMNode, up to + * one newline is already trimmed from the start of text nodes inside <pre>
+ * elements. If we do so again here, we may remove too many newlines. This + * only applies to actual <pre> elements, not elements with the white-space + * CSS property. + */ + const trimStart: TrimStartRule = (() => { + if (whiteSpaceRule !== 'normal') return 'collapse'; + + if ( + !state.inlineFormattingContext || + state.inlineFormattingContext.atStart || + state.inlineFormattingContext.lastHasTrailingWhiteSpace + ) + return 'all'; + + return 'collapse'; + })(); + + const trimEnd: TrimEndRule = (() => { + if (whiteSpaceRule === 'normal') return 'collapse'; + if (isLastNonEmptyTextOfInlineFormattingBlock(text)) + return 'single-newline'; + return 'collapse'; + })(); + + const shouldCollapseWhiteSpace: boolean = { + normal: true, + 'actual-pre': false, + pre: false, + 'pre-line': true, + }[whiteSpaceRule]; + + const whiteSpaceIncludesNewlines = whiteSpaceRule !== 'pre-line'; + + const collapsedTextContent = collapseString(textContent || '', { + trimStart, + trimEnd, + shouldCollapseWhiteSpace, + whiteSpaceIncludesNewlines, + }); + + if (state.inlineFormattingContext && shouldCollapseWhiteSpace) { + state.inlineFormattingContext.lastHasTrailingWhiteSpace = + collapsedTextContent.endsWith(' '); + } + + text.textContent = collapsedTextContent; +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/index.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/index.ts new file mode 100644 index 0000000000..701919be7d --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/index.ts @@ -0,0 +1,17 @@ +/** + * @file Automatically generated by barrelsby. 
+ */ + +export * from './collapseString'; +export * from './collapseWhiteSpace'; +export * from './collapseWhiteSpaceChildren'; +export * from './collapseWhiteSpaceElement'; +export * from './collapseWhiteSpaceNode'; +export * from './collapseWhiteSpaceText'; +export * from './inferWhiteSpaceRule'; +export * from './inlineTagNames'; +export * from './isHtmlBlockElement'; +export * from './isHtmlInlineElement'; +export * from './isLastNonEmptyTextOfInlineFormattingBlock'; +export * from './stateTransforms'; +export * from './types'; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inferWhiteSpaceRule.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inferWhiteSpaceRule.ts new file mode 100644 index 0000000000..cd9bab8462 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inferWhiteSpaceRule.ts @@ -0,0 +1,32 @@ +import { WhiteSpaceRule } from './types'; + +export const inferWhiteSpaceRule = ( + element: HTMLElement +): WhiteSpaceRule | null => { + const whiteSpaceProperty = element.style.whiteSpace; + + switch (whiteSpaceProperty) { + case 'normal': + case 'nowrap': { + return 'normal'; + } + case 'pre': + case 'pre-wrap': + case 'break-spaces': { + return 'pre'; + } + case 'pre-line': { + return 'pre-line'; + } + } + + if (element.tagName === 'PRE') { + return 'actual-pre'; + } + + if (whiteSpaceProperty === 'initial') { + return 'normal'; + } + + return null; +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inlineTagNames.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inlineTagNames.ts new file mode 100644 index 0000000000..87883426fe --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inlineTagNames.ts @@ -0,0 +1,107 @@ +/** + * # Methodology + * + * ## Step 1. 
Get the list of all standard tag names + * + * Go to https://developer.mozilla.org/en-US/docs/Web/HTML/Element and run the + * following in the console to generate a JSON array of tag names: + * + * ```js + * JSON.stringify( + * Array.from(document.querySelectorAll('article table td:first-child')).map((td) => { + * const body = document.createElement('body'); + * body.innerHTML = td.textContent; + * return body.firstChild?.tagName; + * }).filter((tagName) => tagName) + * ); + * ``` + * + * Output (as of 2023-11-06): + * + * ```json + * '["BASE","LINK","META","STYLE","TITLE","ADDRESS","ARTICLE","ASIDE","FOOTER","HEADER","H1","HGROUP","MAIN","NAV","SECTION","SEARCH","BLOCKQUOTE","DD","DIV","DL","DT","FIGCAPTION","FIGURE","HR","LI","MENU","OL","P","PRE","UL","A","ABBR","B","BDI","BDO","BR","CITE","CODE","DATA","DFN","EM","I","KBD","MARK","Q","RP","RT","RUBY","S","SAMP","SMALL","SPAN","STRONG","SUB","SUP","TIME","U","VAR","WBR","AREA","AUDIO","IMG","MAP","TRACK","VIDEO","EMBED","IFRAME","OBJECT","PICTURE","PORTAL","SOURCE","svg","math","CANVAS","NOSCRIPT","SCRIPT","DEL","INS","TABLE","BUTTON","DATALIST","FIELDSET","FORM","INPUT","LABEL","LEGEND","METER","OPTGROUP","OPTION","OUTPUT","PROGRESS","SELECT","TEXTAREA","DETAILS","DIALOG","SUMMARY","SLOT","TEMPLATE","ACRONYM","BIG","CENTER","CONTENT","DIR","FONT","IMG","MARQUEE","MENUITEM","NOBR","NOEMBED","NOFRAMES","PARAM","PLAINTEXT","RB","RTC","SHADOW","STRIKE","TT","XMP"]' + * ``` + * + * ## Step 2. 
For each tag name, determine the default browser style + * + * Open an empty HTML file in the browser and run the following in the console: + * + * ```js + * const tagNames = JSON.parse(<JSON string from step 1>); + * + * JSON.stringify( + * tagNames.filter((tagName) => { + * const element = document.createElement(tagName); + * document.body.appendChild(element); + * const display = window.getComputedStyle(element).display; + * element.remove(); + * return display.startsWith('inline'); + * }) + * ); + * ``` + * + * Place the result in the array below (accurate as of 2023-11-06). + */ + +export const inlineTagNames = new Set([ + 'A', + 'ABBR', + 'B', + 'BDI', + 'BDO', + 'BR', + 'CITE', + 'CODE', + 'DATA', + 'DFN', + 'EM', + 'I', + 'KBD', + 'MARK', + 'Q', + 'S', + 'SAMP', + 'SMALL', + 'SPAN', + 'STRONG', + 'SUB', + 'SUP', + 'TIME', + 'U', + 'VAR', + 'WBR', + 'IMG', + 'MAP', + 'TRACK', + 'VIDEO', + 'EMBED', + 'IFRAME', + 'OBJECT', + 'PICTURE', + 'PORTAL', + 'SOURCE', + 'svg', + 'math', + 'CANVAS', + 'DEL', + 'INS', + 'BUTTON', + 'INPUT', + 'LABEL', + 'METER', + 'OUTPUT', + 'PROGRESS', + 'SELECT', + 'TEXTAREA', + 'ACRONYM', + 'BIG', + 'CONTENT', + 'FONT', + 'IMG', + 'MARQUEE', + 'MENUITEM', + 'NOBR', + 'SHADOW', + 'STRIKE', + 'TT', +]); diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlBlockElement.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlBlockElement.ts new file mode 100644 index 0000000000..d6260a62a0 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlBlockElement.ts @@ -0,0 +1,8 @@ +import { isHtmlElement } from '../isHtmlElement'; +import { isHtmlInlineElement } from './isHtmlInlineElement'; + +export const isHtmlBlockElement = (node: Node): boolean => { + if (!isHtmlElement(node)) return false; + const element = node as HTMLElement; + return !isHtmlInlineElement(element); +}; diff --git 
a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlInlineElement.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlInlineElement.ts new file mode 100644 index 0000000000..5e8557218c --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlInlineElement.ts @@ -0,0 +1,45 @@ +import { isHtmlElement } from '../isHtmlElement'; +import { inlineTagNames } from './inlineTagNames'; + +export const isHtmlInlineElement = (node: Node): boolean => { + if (!isHtmlElement(node)) return false; + const element = node as HTMLElement; + + const tagNameIsInline = inlineTagNames.has(element.tagName); + + /** + * Valid display values include 'inline flow'. We only care about the first + * part. + */ + const displayProperty = element.style.display.split(' ')[0]; + + if (displayProperty === '') { + return tagNameIsInline; + } + + if (displayProperty.startsWith('inline')) { + return true; + } + + if (displayProperty === 'inherit' && element.parentElement) { + return isHtmlInlineElement(element.parentElement); + } + + /** + * Handle all special values manually, so that any unhandled values can be + * assumed to be block. + * + * Note: Ideally, content inside `display: none` elements should not be + * parsed. However, if such elements are parsed, it's best for their inline + * or block status to be left unchanged. 
+ */ + if ( + ['initial', 'unset', 'revert', 'revert-layer', 'contents', 'none'].includes( + displayProperty + ) + ) { + return tagNameIsInline; + } + + return false; +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isLastNonEmptyTextOfInlineFormattingBlock.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isLastNonEmptyTextOfInlineFormattingBlock.ts new file mode 100644 index 0000000000..44d613825e --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isLastNonEmptyTextOfInlineFormattingBlock.ts @@ -0,0 +1,39 @@ +import { isHtmlBlockElement } from './isHtmlBlockElement'; + +export const isLastNonEmptyTextOfInlineFormattingBlock = ( + initialText: Text +): boolean => { + let currentNode: Node | null = initialText; + + while (true) { + if (currentNode.nextSibling) { + currentNode = currentNode.nextSibling; + } else { + // If there is no next sibling, ascend to the parent node + currentNode = currentNode.parentElement; + // If the parent node is a block, we've reached the end + if (currentNode && isHtmlBlockElement(currentNode)) { + return true; + } + // Otherwise, continue to the next sibling of the parent node + currentNode = currentNode?.nextSibling || null; + } + + // If there's no next node, we've reached the end + if (!currentNode) { + return true; + } + + // If the next node is a block, we've reached the end + if (isHtmlBlockElement(currentNode)) { + return true; + } + + // If the next node is a non-empty text node, we're not at the end + if ((currentNode.textContent || '').length > 0) { + return false; + } + + // Otherwise, continue to the next node + } +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/stateTransforms.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/stateTransforms.ts new file mode 100644 index 0000000000..98fbee91c0 --- /dev/null +++ 
b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/stateTransforms.ts @@ -0,0 +1,18 @@ +import { CollapseWhiteSpaceState } from './types'; + +export const upsertInlineFormattingContext = ( + state: CollapseWhiteSpaceState +) => { + if (state.inlineFormattingContext) { + state.inlineFormattingContext.atStart = false; + } else { + state.inlineFormattingContext = { + atStart: true, + lastHasTrailingWhiteSpace: false, + }; + } +}; + +export const endInlineFormattingContext = (state: CollapseWhiteSpaceState) => { + state.inlineFormattingContext = null; +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/types.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/types.ts new file mode 100644 index 0000000000..a6286591d6 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/types.ts @@ -0,0 +1,17 @@ +/** + * Actual <pre> elements are treated differently, so track these as a separate + * rule. + */ +export type WhiteSpaceRule = 'normal' | 'actual-pre' | 'pre' | 'pre-line'; + +export type TrimStartRule = 'collapse' | 'all'; +export type TrimEndRule = 'collapse' | 'single-newline'; + +export type CollapseWhiteSpaceState = { + inlineFormattingContext: null | { + atStart: boolean; + lastHasTrailingWhiteSpace: boolean; + }; + + whiteSpaceRule: WhiteSpaceRule; +}; diff --git a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.ts b/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.ts deleted file mode 100644 index 825761befa..0000000000 --- a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.ts +++ /dev/null @@ -1,438 +0,0 @@ -import { isHtmlElement } from './isHtmlElement'; -import { isHtmlText } from './isHtmlText'; - -/** - * # Methodology: Standard Inline Elements - * - * ## Step 1. 
Get the list of all standard tag names - * - * Go to https://developer.mozilla.org/en-US/docs/Web/HTML/Element and run the - * following in the console to generate a JSON array of tag names: - * - * ```js - * JSON.stringify( - * Array.from(document.querySelectorAll('article table td:first-child')).map((td) => { - * const body = document.createElement('body'); - * body.innerHTML = td.textContent; - * return body.firstChild?.tagName; - * }).filter((tagName) => tagName) - * ); - * ``` - * - * Output (as of 2023-11-06): - * - * ```json - * '["BASE","LINK","META","STYLE","TITLE","ADDRESS","ARTICLE","ASIDE","FOOTER","HEADER","H1","HGROUP","MAIN","NAV","SECTION","SEARCH","BLOCKQUOTE","DD","DIV","DL","DT","FIGCAPTION","FIGURE","HR","LI","MENU","OL","P","PRE","UL","A","ABBR","B","BDI","BDO","BR","CITE","CODE","DATA","DFN","EM","I","KBD","MARK","Q","RP","RT","RUBY","S","SAMP","SMALL","SPAN","STRONG","SUB","SUP","TIME","U","VAR","WBR","AREA","AUDIO","IMG","MAP","TRACK","VIDEO","EMBED","IFRAME","OBJECT","PICTURE","PORTAL","SOURCE","svg","math","CANVAS","NOSCRIPT","SCRIPT","DEL","INS","TABLE","BUTTON","DATALIST","FIELDSET","FORM","INPUT","LABEL","LEGEND","METER","OPTGROUP","OPTION","OUTPUT","PROGRESS","SELECT","TEXTAREA","DETAILS","DIALOG","SUMMARY","SLOT","TEMPLATE","ACRONYM","BIG","CENTER","CONTENT","DIR","FONT","IMG","MARQUEE","MENUITEM","NOBR","NOEMBED","NOFRAMES","PARAM","PLAINTEXT","RB","RTC","SHADOW","STRIKE","TT","XMP"]' - * ``` - * - * ## Step 2. 
For each tag name, determine the default browser style - * - * Open an empty HTML file in the browser and run the following in the console: - * - * ```js - * const tagNames = JSON.parse(<JSON string from step 1>); - * - * JSON.stringify( - * tagNames.filter((tagName) => { - * const element = document.createElement(tagName); - * document.body.appendChild(element); - * const display = window.getComputedStyle(element).display; - * element.remove(); - * return display.startsWith('inline'); - * }) - * ); - * ``` - * - * Place the result in the array below (accurate as of 2023-11-06). - */ -const inlineTags = new Set([ - 'A', - 'ABBR', - 'B', - 'BDI', - 'BDO', - 'BR', - 'CITE', - 'CODE', - 'DATA', - 'DFN', - 'EM', - 'I', - 'KBD', - 'MARK', - 'Q', - 'S', - 'SAMP', - 'SMALL', - 'SPAN', - 'STRONG', - 'SUB', - 'SUP', - 'TIME', - 'U', - 'VAR', - 'WBR', - 'IMG', - 'MAP', - 'TRACK', - 'VIDEO', - 'EMBED', - 'IFRAME', - 'OBJECT', - 'PICTURE', - 'PORTAL', - 'SOURCE', - 'svg', - 'math', - 'CANVAS', - 'DEL', - 'INS', - 'BUTTON', - 'INPUT', - 'LABEL', - 'METER', - 'OUTPUT', - 'PROGRESS', - 'SELECT', - 'TEXTAREA', - 'ACRONYM', - 'BIG', - 'CONTENT', - 'FONT', - 'IMG', - 'MARQUEE', - 'MENUITEM', - 'NOBR', - 'SHADOW', - 'STRIKE', - 'TT', -]); - -/** - * Actual <pre> elements are treated differently, so track these as a separate - * rule. 
- */ -type WhiteSpaceRule = 'normal' | 'actual-pre' | 'pre' | 'pre-line'; - -type TrimStartRule = 'collapse' | 'all'; -type TrimEndRule = 'collapse' | 'single-newline'; - -type CollapseWhiteSpaceState = { - inlineFormattingContext: null | { - atStart: boolean; - lastHasTrailingWhiteSpace: boolean; - }; - - whiteSpaceRule: WhiteSpaceRule; -}; - -// Entrypoint -export const collapseWhiteSpace = (element: HTMLElement) => { - const clonedElement = element.cloneNode(true) as HTMLElement; - - // Mutable state object - const state: CollapseWhiteSpaceState = { - inlineFormattingContext: null, - whiteSpaceRule: 'normal', - }; - - collapseWhiteSpaceElement(clonedElement, state); - - return clonedElement; -}; - -// Recursive functions -const collapseWhiteSpaceNode = (node: Node, state: CollapseWhiteSpaceState) => { - if (isHtmlElement(node)) { - collapseWhiteSpaceElement(node as HTMLElement, state); - return; - } - - if (isHtmlText(node)) { - collapseWhiteSpaceText(node as Text, state); - return; - } - - collapseWhiteSpaceChildren(node, state); -}; - -const collapseWhiteSpaceChildren = ( - node: Node, - state: CollapseWhiteSpaceState -) => { - const childNodes = Array.from(node.childNodes); - - for (const childNode of childNodes) { - collapseWhiteSpaceNode(childNode, state); - } -}; - -const collapseWhiteSpaceElement = ( - element: HTMLElement, - state: CollapseWhiteSpaceState -) => { - const isInlineElement = isHtmlInlineElement(element); - const previousWhiteSpaceRule = state.whiteSpaceRule; - const inferredWhiteSpaceRule = inferWhiteSpaceRule(element); - - if (inferredWhiteSpaceRule) { - state.whiteSpaceRule = inferredWhiteSpaceRule; - } - - /** - * Note: We do not want to start an inline formatting context until we - * encounter a text node. 
- */ - - // End any existing inline formatting context - if (!isInlineElement) { - endInlineFormattingContext(state); - } - - collapseWhiteSpaceChildren(element, state); - - // Do not let inline formatting context break out of block elements - if (!isInlineElement) { - endInlineFormattingContext(state); - } - - // Restore previous whiteSpaceRule - state.whiteSpaceRule = previousWhiteSpaceRule; -}; - -const collapseWhiteSpaceText = (text: Text, state: CollapseWhiteSpaceState) => { - const textContent = text.textContent || ''; - const isWhiteSpaceOnly = textContent.trim() === ''; - - // Do not start an inline formatting context with a whiteSpace-only text node - if (state.inlineFormattingContext || !isWhiteSpaceOnly) { - upsertInlineFormattingContext(state); - } - - const { whiteSpaceRule } = state; - - /** - * Note: Due to the way HTML strings are parsed in htmlStringToDOMNode, up to - * one newline is already trimmed from the start of text nodes inside <pre> - * elements. If we do so again here, we may remove too many newlines. This - * only applies to actual <pre> elements, not elements with the white-space - * CSS property. 
- */ - const trimStart: TrimStartRule = (() => { - if (whiteSpaceRule !== 'normal') return 'collapse'; - - if ( - !state.inlineFormattingContext || - state.inlineFormattingContext.atStart || - state.inlineFormattingContext.lastHasTrailingWhiteSpace - ) - return 'all'; - - return 'collapse'; - })(); - - const trimEnd: TrimEndRule = (() => { - if (whiteSpaceRule === 'normal') return 'collapse'; - if (isLastNonEmptyTextOfInlineFormattingBlock(text)) - return 'single-newline'; - return 'collapse'; - })(); - - const shouldCollapseWhiteSpace: boolean = { - normal: true, - 'actual-pre': false, - pre: false, - 'pre-line': true, - }[whiteSpaceRule]; - - const whiteSpaceIncludesNewlines = whiteSpaceRule !== 'pre-line'; - - const collapsedTextContent = collapseString(textContent || '', { - trimStart, - trimEnd, - shouldCollapseWhiteSpace, - whiteSpaceIncludesNewlines, - }); - - if (state.inlineFormattingContext && shouldCollapseWhiteSpace) { - state.inlineFormattingContext.lastHasTrailingWhiteSpace = - collapsedTextContent.endsWith(' '); - } - - text.textContent = collapsedTextContent; -}; - -// Utilities -const collapseString = ( - text: string, - { - trimStart = 'collapse', - trimEnd = 'collapse', - shouldCollapseWhiteSpace = true, - whiteSpaceIncludesNewlines = true, - }: { - trimStart?: TrimStartRule; - trimEnd?: TrimEndRule; - shouldCollapseWhiteSpace?: boolean; - whiteSpaceIncludesNewlines?: boolean; - } = {} -) => { - if (trimStart === 'all') { - text = text.replace(/^\s+/, ''); - } - - if (trimEnd === 'single-newline') { - // Strip at most one newline from the end - text = text.replace(/\n$/, ''); - } - - if (shouldCollapseWhiteSpace) { - if (whiteSpaceIncludesNewlines) { - text = text.replaceAll(/\s+/g, ' '); - } else { - // Collapse horizontal whitespace - text = text.replaceAll(/[^\S\n\r]+/g, ' '); - - /** - * Trim horizontal whitespace from the start and end of lines (behavior - * of pre-line). 
- */ - text = text.replaceAll(/^[^\S\n\r]+/gm, ''); - text = text.replaceAll(/[^\S\n\r]+$/gm, ''); - } - } - - return text; -}; - -const inferWhiteSpaceRule = (element: HTMLElement): WhiteSpaceRule | null => { - const whiteSpaceProperty = element.style.whiteSpace; - - switch (whiteSpaceProperty) { - case 'normal': - case 'nowrap': { - return 'normal'; - } - case 'pre': - case 'pre-wrap': - case 'break-spaces': { - return 'pre'; - } - case 'pre-line': { - return 'pre-line'; - } - } - - if (element.tagName === 'PRE') { - return 'actual-pre'; - } - - if (whiteSpaceProperty === 'initial') { - return 'normal'; - } - - return null; -}; - -const isHtmlInlineElement = (node: Node): boolean => { - if (!isHtmlElement(node)) return false; - const element = node as HTMLElement; - - const tagNameIsInline = inlineTags.has(element.tagName); - - /** - * Valid display values include 'inline flow'. We only care about the first - * part. - */ - const displayProperty = element.style.display.split(' ')[0]; - - if (displayProperty === '') { - return tagNameIsInline; - } - - if (displayProperty.startsWith('inline')) { - return true; - } - - if (displayProperty === 'inherit' && element.parentElement) { - return isHtmlInlineElement(element.parentElement); - } - - /** - * Handle all special values manually, so that any unhandled values can be - * assumed to be block. - * - * Note: Ideally, content inside `display: none` elements should not be - * parsed. However, if such elements are parsed, it's best for their inline - * or block status to be left unchanged. 
- */ - if ( - ['initial', 'unset', 'revert', 'revert-layer', 'contents', 'none'].includes( - displayProperty - ) - ) { - return tagNameIsInline; - } - - return false; -}; - -const isHtmlBlockElement = (node: Node): boolean => { - if (!isHtmlElement(node)) return false; - const element = node as HTMLElement; - return !isHtmlInlineElement(element); -}; - -const isLastNonEmptyTextOfInlineFormattingBlock = ( - initialText: Text -): boolean => { - let currentNode: Node | null = initialText; - - while (true) { - if (currentNode.nextSibling) { - currentNode = currentNode.nextSibling; - } else { - // If there is no next sibling, ascend to the parent node - currentNode = currentNode.parentElement; - // If the parent node is a block, we've reached the end - if (currentNode && isHtmlBlockElement(currentNode)) { - return true; - } - // Otherwise, continue to the next sibling of the parent node - currentNode = currentNode?.nextSibling || null; - } - - // If there's no next node, we've reached the end - if (!currentNode) { - return true; - } - - // If the next node is a block, we've reached the end - if (isHtmlBlockElement(currentNode)) { - return true; - } - - // If the next node is a non-empty text node, we're not at the end - if ((currentNode.textContent || '').length > 0) { - return false; - } - - // Otherwise, continue to the next node - } -}; - -// State transforms -const upsertInlineFormattingContext = (state: CollapseWhiteSpaceState) => { - if (state.inlineFormattingContext) { - state.inlineFormattingContext.atStart = false; - } else { - state.inlineFormattingContext = { - atStart: true, - lastHasTrailingWhiteSpace: false, - }; - } -}; - -const endInlineFormattingContext = (state: CollapseWhiteSpaceState) => { - state.inlineFormattingContext = null; -}; diff --git a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts index 85bece6048..b6c2006884 100644 --- 
a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts +++ b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts @@ -2,7 +2,7 @@ import { EDescendant, Value } from '@udecode/slate'; import { PlateEditor } from '../../../types/PlateEditor'; import { normalizeDescendantsToDocumentFragment } from '../../../utils/normalizeDescendantsToDocumentFragment'; -import { collapseWhiteSpace } from './collapseWhiteSpace'; +import { collapseWhiteSpace } from './collapse-white-space'; import { deserializeHtmlElement } from './deserializeHtmlElement'; import { htmlStringToDOMNode } from './htmlStringToDOMNode'; diff --git a/packages/core/src/plugins/html-deserializer/utils/index.ts b/packages/core/src/plugins/html-deserializer/utils/index.ts index 804dec0cd1..1b8c9386e2 100644 --- a/packages/core/src/plugins/html-deserializer/utils/index.ts +++ b/packages/core/src/plugins/html-deserializer/utils/index.ts @@ -8,7 +8,6 @@ export * from './cleanHtmlEmptyElements'; export * from './cleanHtmlFontElements'; export * from './cleanHtmlLinkElements'; export * from './cleanHtmlTextNodes'; -export * from './collapseWhiteSpace'; export * from './copyBlockMarksToSpanChild'; export * from './deserializeHtml'; export * from './deserializeHtmlElement'; @@ -44,3 +43,4 @@ export * from './traverseHtmlElements'; export * from './traverseHtmlNode'; export * from './traverseHtmlTexts'; export * from './unwrapHtmlElement'; +export * from './collapse-white-space/index';