From ef155679c1bd6f913b4e4b06ce5543f23b7c50ae Mon Sep 17 00:00:00 2001
From: Joe Anderson
Date: Mon, 6 Nov 2023 21:35:25 +0000
Subject: [PATCH] Refactor

---
 .../collapse-white-space/collapseString.ts    |  43 ++
 .../collapseWhiteSpace.spec.ts                |   2 +-
 .../collapseWhiteSpace.ts                     |  17 +
 .../collapseWhiteSpaceChildren.ts             |  13 +
 .../collapseWhiteSpaceElement.ts              |  38 ++
 .../collapseWhiteSpaceNode.ts                 |  23 +
 .../collapseWhiteSpaceText.ts                 |  69 +++
 .../utils/collapse-white-space/index.ts       |  17 +
 .../inferWhiteSpaceRule.ts                    |  32 ++
 .../collapse-white-space/inlineTagNames.ts    | 107 +++++
 .../isHtmlBlockElement.ts                     |   8 +
 .../isHtmlInlineElement.ts                    |  45 ++
 ...LastNonEmptyTextOfInlineFormattingBlock.ts |  39 ++
 .../collapse-white-space/stateTransforms.ts   |  18 +
 .../utils/collapse-white-space/types.ts       |  17 +
 .../utils/collapseWhiteSpace.ts               | 438 ------------------
 .../utils/deserializeHtml.ts                  |   2 +-
 .../plugins/html-deserializer/utils/index.ts  |   2 +-
 18 files changed, 489 insertions(+), 441 deletions(-)
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseString.ts
 rename packages/core/src/plugins/html-deserializer/utils/{ => collapse-white-space}/collapseWhiteSpace.spec.ts (99%)
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceChildren.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceElement.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceNode.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceText.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/index.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inferWhiteSpaceRule.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inlineTagNames.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlBlockElement.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlInlineElement.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isLastNonEmptyTextOfInlineFormattingBlock.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/stateTransforms.ts
 create mode 100644 packages/core/src/plugins/html-deserializer/utils/collapse-white-space/types.ts
 delete mode 100644 packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.ts

diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseString.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseString.ts
new file mode 100644
index 0000000000..72557334b9
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseString.ts
@@ -0,0 +1,43 @@
+import { TrimEndRule, TrimStartRule } from './types';
+
+export const collapseString = (
+  text: string,
+  {
+    trimStart = 'collapse',
+    trimEnd = 'collapse',
+    shouldCollapseWhiteSpace = true,
+    whiteSpaceIncludesNewlines = true,
+  }: {
+    trimStart?: TrimStartRule;
+    trimEnd?: TrimEndRule;
+    shouldCollapseWhiteSpace?: boolean;
+    whiteSpaceIncludesNewlines?: boolean;
+  } = {}
+) => {
+  if (trimStart === 'all') {
+    text = text.replace(/^\s+/, '');
+  }
+
+  if (trimEnd === 'single-newline') {
+    // Strip at most one newline from the end
+    text = text.replace(/\n$/, '');
+  }
+
+  if (shouldCollapseWhiteSpace) {
+    if (whiteSpaceIncludesNewlines) {
+      text = text.replaceAll(/\s+/g, ' ');
+    } else {
+      // Collapse horizontal whitespace
+      text = text.replaceAll(/[^\S\n\r]+/g, ' ');
+
+      /**
+       * Trim horizontal whitespace from the start and end of lines (behavior
+       * of pre-line).
+       */
+      text = text.replaceAll(/^[^\S\n\r]+/gm, '');
+      text = text.replaceAll(/[^\S\n\r]+$/gm, '');
+    }
+  }
+
+  return text;
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.spec.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.spec.ts
similarity index 99%
rename from packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.spec.ts
rename to packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.spec.ts
index 592a1396e7..af564eeb1a 100644
--- a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.spec.ts
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.spec.ts
@@ -1,5 +1,5 @@
+import { htmlStringToDOMNode } from '../htmlStringToDOMNode';
 import { collapseWhiteSpace } from './collapseWhiteSpace';
-import { htmlStringToDOMNode } from './htmlStringToDOMNode';
 
 const expectCollapsedWhiteSpace = (input: string, expected: string) => {
   const element = htmlStringToDOMNode(input);
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.ts
new file mode 100644
index 0000000000..eed683d972
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpace.ts
@@ -0,0 +1,17 @@
+import { collapseWhiteSpaceElement } from './collapseWhiteSpaceElement';
+import { CollapseWhiteSpaceState } from './types';
+
+// Entrypoint
+export const collapseWhiteSpace = (element: HTMLElement) => {
+  const clonedElement = element.cloneNode(true) as HTMLElement;
+
+  // Mutable state object
+  const state: CollapseWhiteSpaceState = {
+    inlineFormattingContext: null,
+    whiteSpaceRule: 'normal',
+  };
+
+  collapseWhiteSpaceElement(clonedElement, state);
+
+  return clonedElement;
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceChildren.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceChildren.ts
new file mode 100644
index 0000000000..9ecd62c4ad
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceChildren.ts
@@ -0,0 +1,13 @@
+import { collapseWhiteSpaceNode } from './collapseWhiteSpaceNode';
+import { CollapseWhiteSpaceState } from './types';
+
+export const collapseWhiteSpaceChildren = (
+  node: Node,
+  state: CollapseWhiteSpaceState
+) => {
+  const childNodes = Array.from(node.childNodes);
+
+  for (const childNode of childNodes) {
+    collapseWhiteSpaceNode(childNode, state);
+  }
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceElement.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceElement.ts
new file mode 100644
index 0000000000..be3ac6ca72
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceElement.ts
@@ -0,0 +1,38 @@
+import { collapseWhiteSpaceChildren } from './collapseWhiteSpaceChildren';
+import { inferWhiteSpaceRule } from './inferWhiteSpaceRule';
+import { isHtmlInlineElement } from './isHtmlInlineElement';
+import { endInlineFormattingContext } from './stateTransforms';
+import { CollapseWhiteSpaceState } from './types';
+
+export const collapseWhiteSpaceElement = (
+  element: HTMLElement,
+  state: CollapseWhiteSpaceState
+) => {
+  const isInlineElement = isHtmlInlineElement(element);
+  const previousWhiteSpaceRule = state.whiteSpaceRule;
+  const inferredWhiteSpaceRule = inferWhiteSpaceRule(element);
+
+  if (inferredWhiteSpaceRule) {
+    state.whiteSpaceRule = inferredWhiteSpaceRule;
+  }
+
+  /**
+   * Note: We do not want to start an inline formatting context until we
+   * encounter a text node.
+   */
+
+  // End any existing inline formatting context
+  if (!isInlineElement) {
+    endInlineFormattingContext(state);
+  }
+
+  collapseWhiteSpaceChildren(element, state);
+
+  // Do not let inline formatting context break out of block elements
+  if (!isInlineElement) {
+    endInlineFormattingContext(state);
+  }
+
+  // Restore previous whiteSpaceRule
+  state.whiteSpaceRule = previousWhiteSpaceRule;
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceNode.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceNode.ts
new file mode 100644
index 0000000000..3b6ce09e32
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceNode.ts
@@ -0,0 +1,23 @@
+import { isHtmlElement } from '../isHtmlElement';
+import { isHtmlText } from '../isHtmlText';
+import { collapseWhiteSpaceChildren } from './collapseWhiteSpaceChildren';
+import { collapseWhiteSpaceElement } from './collapseWhiteSpaceElement';
+import { collapseWhiteSpaceText } from './collapseWhiteSpaceText';
+import { CollapseWhiteSpaceState } from './types';
+
+export const collapseWhiteSpaceNode = (
+  node: Node,
+  state: CollapseWhiteSpaceState
+) => {
+  if (isHtmlElement(node)) {
+    collapseWhiteSpaceElement(node as HTMLElement, state);
+    return;
+  }
+
+  if (isHtmlText(node)) {
+    collapseWhiteSpaceText(node as Text, state);
+    return;
+  }
+
+  collapseWhiteSpaceChildren(node, state);
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceText.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceText.ts
new file mode 100644
index 0000000000..2288681fe9
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/collapseWhiteSpaceText.ts
@@ -0,0 +1,69 @@
+import { collapseString } from './collapseString';
+import { isLastNonEmptyTextOfInlineFormattingBlock } from './isLastNonEmptyTextOfInlineFormattingBlock';
+import { upsertInlineFormattingContext } from './stateTransforms';
+import { CollapseWhiteSpaceState, TrimEndRule, TrimStartRule } from './types';
+
+export const collapseWhiteSpaceText = (
+  text: Text,
+  state: CollapseWhiteSpaceState
+) => {
+  const textContent = text.textContent || '';
+  const isWhiteSpaceOnly = textContent.trim() === '';
+
+  // Do not start an inline formatting context with a whiteSpace-only text node
+  if (state.inlineFormattingContext || !isWhiteSpaceOnly) {
+    upsertInlineFormattingContext(state);
+  }
+
+  const { whiteSpaceRule } = state;
+
+  /**
+   * Note: Due to the way HTML strings are parsed in htmlStringToDOMNode, up to
+   * one newline is already trimmed from the start of text nodes inside <pre>
+   * elements. If we do so again here, we may remove too many newlines. This
+   * only applies to actual <pre> elements, not elements with the white-space
+   * CSS property.
+   */
+  const trimStart: TrimStartRule = (() => {
+    if (whiteSpaceRule !== 'normal') return 'collapse';
+
+    if (
+      !state.inlineFormattingContext ||
+      state.inlineFormattingContext.atStart ||
+      state.inlineFormattingContext.lastHasTrailingWhiteSpace
+    )
+      return 'all';
+
+    return 'collapse';
+  })();
+
+  const trimEnd: TrimEndRule = (() => {
+    if (whiteSpaceRule === 'normal') return 'collapse';
+    if (isLastNonEmptyTextOfInlineFormattingBlock(text))
+      return 'single-newline';
+    return 'collapse';
+  })();
+
+  const shouldCollapseWhiteSpace: boolean = {
+    normal: true,
+    'actual-pre': false,
+    pre: false,
+    'pre-line': true,
+  }[whiteSpaceRule];
+
+  const whiteSpaceIncludesNewlines = whiteSpaceRule !== 'pre-line';
+
+  const collapsedTextContent = collapseString(textContent || '', {
+    trimStart,
+    trimEnd,
+    shouldCollapseWhiteSpace,
+    whiteSpaceIncludesNewlines,
+  });
+
+  if (state.inlineFormattingContext && shouldCollapseWhiteSpace) {
+    state.inlineFormattingContext.lastHasTrailingWhiteSpace =
+      collapsedTextContent.endsWith(' ');
+  }
+
+  text.textContent = collapsedTextContent;
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/index.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/index.ts
new file mode 100644
index 0000000000..701919be7d
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/index.ts
@@ -0,0 +1,17 @@
+/**
+ * @file Automatically generated by barrelsby.
+ */
+
+export * from './collapseString';
+export * from './collapseWhiteSpace';
+export * from './collapseWhiteSpaceChildren';
+export * from './collapseWhiteSpaceElement';
+export * from './collapseWhiteSpaceNode';
+export * from './collapseWhiteSpaceText';
+export * from './inferWhiteSpaceRule';
+export * from './inlineTagNames';
+export * from './isHtmlBlockElement';
+export * from './isHtmlInlineElement';
+export * from './isLastNonEmptyTextOfInlineFormattingBlock';
+export * from './stateTransforms';
+export * from './types';
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inferWhiteSpaceRule.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inferWhiteSpaceRule.ts
new file mode 100644
index 0000000000..cd9bab8462
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inferWhiteSpaceRule.ts
@@ -0,0 +1,40 @@
+import { WhiteSpaceRule } from './types';
+
+/**
+ * Infer the white-space rule an element establishes for its contents, based
+ * on its inline `white-space` style and its tag name. Returns null when
+ * neither determines a rule.
+ */
+export const inferWhiteSpaceRule = (
+  element: HTMLElement
+): WhiteSpaceRule | null => {
+  const declared = element.style.whiteSpace;
+
+  // 'nowrap' is tracked as 'normal'
+  if (declared === 'normal' || declared === 'nowrap') {
+    return 'normal';
+  }
+
+  // 'pre-wrap' and 'break-spaces' are tracked as 'pre'
+  if (
+    declared === 'pre' ||
+    declared === 'pre-wrap' ||
+    declared === 'break-spaces'
+  ) {
+    return 'pre';
+  }
+
+  if (declared === 'pre-line') {
+    return 'pre-line';
+  }
+
+  /**
+   * The tag name is consulted before the 'initial' keyword, so a PRE element
+   * whose style is `white-space: initial` is still reported as 'actual-pre'.
+   */
+  if (element.tagName === 'PRE') {
+    return 'actual-pre';
+  }
+
+  return declared === 'initial' ? 'normal' : null;
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inlineTagNames.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inlineTagNames.ts
new file mode 100644
index 0000000000..87883426fe
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/inlineTagNames.ts
@@ -0,0 +1,107 @@
+/**
+ * # Methodology
+ *
+ * ## Step 1. Get the list of all standard tag names
+ *
+ * Go to https://developer.mozilla.org/en-US/docs/Web/HTML/Element and run the
+ * following in the console to generate a JSON array of tag names:
+ *
+ * ```js
+ * JSON.stringify(
+ *   Array.from(document.querySelectorAll('article table td:first-child')).map((td) => {
+ *     const body = document.createElement('body');
+ *     body.innerHTML = td.textContent;
+ *     return body.firstChild?.tagName;
+ *   }).filter((tagName) => tagName)
+ * );
+ * ```
+ *
+ * Output (as of 2023-11-06):
+ *
+ * ```json
+ * '["BASE","LINK","META","STYLE","TITLE","ADDRESS","ARTICLE","ASIDE","FOOTER","HEADER","H1","HGROUP","MAIN","NAV","SECTION","SEARCH","BLOCKQUOTE","DD","DIV","DL","DT","FIGCAPTION","FIGURE","HR","LI","MENU","OL","P","PRE","UL","A","ABBR","B","BDI","BDO","BR","CITE","CODE","DATA","DFN","EM","I","KBD","MARK","Q","RP","RT","RUBY","S","SAMP","SMALL","SPAN","STRONG","SUB","SUP","TIME","U","VAR","WBR","AREA","AUDIO","IMG","MAP","TRACK","VIDEO","EMBED","IFRAME","OBJECT","PICTURE","PORTAL","SOURCE","svg","math","CANVAS","NOSCRIPT","SCRIPT","DEL","INS","TABLE","BUTTON","DATALIST","FIELDSET","FORM","INPUT","LABEL","LEGEND","METER","OPTGROUP","OPTION","OUTPUT","PROGRESS","SELECT","TEXTAREA","DETAILS","DIALOG","SUMMARY","SLOT","TEMPLATE","ACRONYM","BIG","CENTER","CONTENT","DIR","FONT","IMG","MARQUEE","MENUITEM","NOBR","NOEMBED","NOFRAMES","PARAM","PLAINTEXT","RB","RTC","SHADOW","STRIKE","TT","XMP"]'
+ * ```
+ *
+ * ## Step 2. For each tag name, determine the default browser style
+ *
+ * Open an empty HTML file in the browser and run the following in the console:
+ *
+ * ```js
+ * const tagNames = JSON.parse(<JSON string from step 1>);
+ *
+ * JSON.stringify(
+ *   tagNames.filter((tagName) => {
+ *     const element = document.createElement(tagName);
+ *     document.body.appendChild(element);
+ *     const display = window.getComputedStyle(element).display;
+ *     element.remove();
+ *     return display.startsWith('inline');
+ *   })
+ * );
+ * ```
+ *
+ * Place the result in the array below (accurate as of 2023-11-06). IMG occurs
+ * twice in the step 1 output; it is listed only once here.
+ */
+
+export const inlineTagNames = new Set([
+  'A',
+  'ABBR',
+  'B',
+  'BDI',
+  'BDO',
+  'BR',
+  'CITE',
+  'CODE',
+  'DATA',
+  'DFN',
+  'EM',
+  'I',
+  'KBD',
+  'MARK',
+  'Q',
+  'S',
+  'SAMP',
+  'SMALL',
+  'SPAN',
+  'STRONG',
+  'SUB',
+  'SUP',
+  'TIME',
+  'U',
+  'VAR',
+  'WBR',
+  'IMG',
+  'MAP',
+  'TRACK',
+  'VIDEO',
+  'EMBED',
+  'IFRAME',
+  'OBJECT',
+  'PICTURE',
+  'PORTAL',
+  'SOURCE',
+  'svg',
+  'math',
+  'CANVAS',
+  'DEL',
+  'INS',
+  'BUTTON',
+  'INPUT',
+  'LABEL',
+  'METER',
+  'OUTPUT',
+  'PROGRESS',
+  'SELECT',
+  'TEXTAREA',
+  'ACRONYM',
+  'BIG',
+  'CONTENT',
+  'FONT',
+  'MARQUEE',
+  'MENUITEM',
+  'NOBR',
+  'SHADOW',
+  'STRIKE',
+  'TT',
+]);
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlBlockElement.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlBlockElement.ts
new file mode 100644
index 0000000000..d6260a62a0
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlBlockElement.ts
@@ -0,0 +1,9 @@
+import { isHtmlElement } from '../isHtmlElement';
+import { isHtmlInlineElement } from './isHtmlInlineElement';
+
+/**
+ * Whether a node is a block element, i.e. it passes isHtmlElement and is not
+ * classified as inline by isHtmlInlineElement.
+ */
+export const isHtmlBlockElement = (node: Node): boolean =>
+  isHtmlElement(node) && !isHtmlInlineElement(node as HTMLElement);
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlInlineElement.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlInlineElement.ts
new file mode 100644
index 0000000000..5e8557218c
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isHtmlInlineElement.ts
@@ -0,0 +1,52 @@
+import { isHtmlElement } from '../isHtmlElement';
+import { inlineTagNames } from './inlineTagNames';
+
+/**
+ * Whether a node is an element classified as inline, judged from its inline
+ * `display` style and, failing that, from its tag name.
+ */
+export const isHtmlInlineElement = (node: Node): boolean => {
+  if (!isHtmlElement(node)) return false;
+
+  const element = node as HTMLElement;
+  const inlineByTagName = inlineTagNames.has(element.tagName);
+
+  // Multi-keyword values such as 'inline flow' are judged by their first part
+  const [outerDisplay] = element.style.display.split(' ');
+
+  if (outerDisplay.startsWith('inline')) {
+    return true;
+  }
+
+  switch (outerDisplay) {
+    /**
+     * No display style, or a special keyword that is handled by deferring to
+     * the tag name.
+     *
+     * Note: Ideally, content inside `display: none` elements should not be
+     * parsed. However, if such elements are parsed, it's best for their
+     * inline or block status to be left unchanged.
+     */
+    case '':
+    case 'initial':
+    case 'unset':
+    case 'revert':
+    case 'revert-layer':
+    case 'contents':
+    case 'none': {
+      return inlineByTagName;
+    }
+
+    // Without a parent element, 'inherit' is treated like any unhandled value
+    case 'inherit': {
+      return element.parentElement
+        ? isHtmlInlineElement(element.parentElement)
+        : false;
+    }
+
+    // Any unhandled value is assumed to be block
+    default: {
+      return false;
+    }
+  }
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isLastNonEmptyTextOfInlineFormattingBlock.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isLastNonEmptyTextOfInlineFormattingBlock.ts
new file mode 100644
index 0000000000..44d613825e
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/isLastNonEmptyTextOfInlineFormattingBlock.ts
@@ -0,0 +1,50 @@
+import { isHtmlBlockElement } from './isHtmlBlockElement';
+
+/**
+ * Whether no non-empty content follows `initialText` before its inline
+ * formatting block ends, i.e. before the enclosing block element closes or
+ * another block element starts.
+ *
+ * Following nodes are inspected without descending into them; an element
+ * counts as non-empty when its `textContent` is non-empty.
+ */
+export const isLastNonEmptyTextOfInlineFormattingBlock = (
+  initialText: Text
+): boolean => {
+  let currentNode: Node = initialText;
+
+  while (true) {
+    let nextNode: Node | null = currentNode.nextSibling;
+
+    /**
+     * If there is no next sibling, ascend through every inline ancestor that
+     * ends here. Ascending only one level would wrongly report the end for
+     * `<b><i>a</i></b>b`, where `b` still follows in the same block.
+     */
+    while (!nextNode) {
+      const parent: HTMLElement | null = currentNode.parentElement;
+
+      // No parent left, or the enclosing block ends here
+      if (!parent || isHtmlBlockElement(parent)) {
+        return true;
+      }
+
+      currentNode = parent;
+      nextNode = currentNode.nextSibling;
+    }
+
+    currentNode = nextNode;
+
+    // If the next node is a block, we've reached the end
+    if (isHtmlBlockElement(currentNode)) {
+      return true;
+    }
+
+    // If the next node is non-empty, we're not at the end
+    if ((currentNode.textContent || '').length > 0) {
+      return false;
+    }
+
+    // Otherwise, continue to the next node
+  }
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/stateTransforms.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/stateTransforms.ts
new file mode 100644
index 0000000000..98fbee91c0
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/stateTransforms.ts
@@ -0,0 +1,26 @@
+import { CollapseWhiteSpaceState } from './types';
+
+/**
+ * Open an inline formatting context if none exists (flagged as being at its
+ * start); otherwise mark the existing context as no longer at its start.
+ */
+export const upsertInlineFormattingContext = (
+  state: CollapseWhiteSpaceState
+) => {
+  const context = state.inlineFormattingContext;
+
+  if (!context) {
+    state.inlineFormattingContext = {
+      atStart: true,
+      lastHasTrailingWhiteSpace: false,
+    };
+    return;
+  }
+
+  context.atStart = false;
+};
+
+/** Close the current inline formatting context, if any. */
+export const endInlineFormattingContext = (state: CollapseWhiteSpaceState) => {
+  state.inlineFormattingContext = null;
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/types.ts b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/types.ts
new file mode 100644
index 0000000000..a6286591d6
--- /dev/null
+++ b/packages/core/src/plugins/html-deserializer/utils/collapse-white-space/types.ts
@@ -0,0 +1,26 @@
+/**
+ * White-space rule applying to a subtree.
+ *
+ * Actual <pre> elements are treated differently, so track these as a separate
+ * rule.
+ */
+export type WhiteSpaceRule = 'normal' | 'actual-pre' | 'pre' | 'pre-line';
+
+/** How white space at the start of a text node is trimmed. */
+export type TrimStartRule = 'collapse' | 'all';
+/** How white space at the end of a text node is trimmed. */
+export type TrimEndRule = 'collapse' | 'single-newline';
+
+/** Mutable state shared across the white-space collapsing traversal. */
+export type CollapseWhiteSpaceState = {
+  /** Null when no inline formatting context is open. */
+  inlineFormattingContext: null | {
+    /** True from creation until the context is upserted again. */
+    atStart: boolean;
+    /** Whether the most recently collapsed text ended with a space. */
+    lastHasTrailingWhiteSpace: boolean;
+  };
+
+  /** Rule in effect for the element currently being traversed. */
+  whiteSpaceRule: WhiteSpaceRule;
+};
diff --git a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.ts b/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.ts
deleted file mode 100644
index 825761befa..0000000000
--- a/packages/core/src/plugins/html-deserializer/utils/collapseWhiteSpace.ts
+++ /dev/null
@@ -1,438 +0,0 @@
-import { isHtmlElement } from './isHtmlElement';
-import { isHtmlText } from './isHtmlText';
-
-/**
- * # Methodology: Standard Inline Elements
- *
- * ## Step 1. Get the list of all standard tag names
- *
- * Go to https://developer.mozilla.org/en-US/docs/Web/HTML/Element and run the
- * following in the console to generate a JSON array of tag names:
- *
- * ```js
- * JSON.stringify(
- *   Array.from(document.querySelectorAll('article table td:first-child')).map((td) => {
- *     const body = document.createElement('body');
- *     body.innerHTML = td.textContent;
- *     return body.firstChild?.tagName;
- *   }).filter((tagName) => tagName)
- * );
- * ```
- *
- * Output (as of 2023-11-06):
- *
- * ```json
- * '["BASE","LINK","META","STYLE","TITLE","ADDRESS","ARTICLE","ASIDE","FOOTER","HEADER","H1","HGROUP","MAIN","NAV","SECTION","SEARCH","BLOCKQUOTE","DD","DIV","DL","DT","FIGCAPTION","FIGURE","HR","LI","MENU","OL","P","PRE","UL","A","ABBR","B","BDI","BDO","BR","CITE","CODE","DATA","DFN","EM","I","KBD","MARK","Q","RP","RT","RUBY","S","SAMP","SMALL","SPAN","STRONG","SUB","SUP","TIME","U","VAR","WBR","AREA","AUDIO","IMG","MAP","TRACK","VIDEO","EMBED","IFRAME","OBJECT","PICTURE","PORTAL","SOURCE","svg","math","CANVAS","NOSCRIPT","SCRIPT","DEL","INS","TABLE","BUTTON","DATALIST","FIELDSET","FORM","INPUT","LABEL","LEGEND","METER","OPTGROUP","OPTION","OUTPUT","PROGRESS","SELECT","TEXTAREA","DETAILS","DIALOG","SUMMARY","SLOT","TEMPLATE","ACRONYM","BIG","CENTER","CONTENT","DIR","FONT","IMG","MARQUEE","MENUITEM","NOBR","NOEMBED","NOFRAMES","PARAM","PLAINTEXT","RB","RTC","SHADOW","STRIKE","TT","XMP"]'
- * ```
- *
- * ## Step 2. For each tag name, determine the default browser style
- *
- * Open an empty HTML file in the browser and run the following in the console:
- *
- * ```js
- * const tagNames = JSON.parse(<JSON string from step 1>);
- *
- * JSON.stringify(
- *   tagNames.filter((tagName) => {
- *     const element = document.createElement(tagName);
- *     document.body.appendChild(element);
- *     const display = window.getComputedStyle(element).display;
- *     element.remove();
- *     return display.startsWith('inline');
- *   })
- * );
- * ```
- *
- * Place the result in the array below (accurate as of 2023-11-06).
- */
-const inlineTags = new Set([
-  'A',
-  'ABBR',
-  'B',
-  'BDI',
-  'BDO',
-  'BR',
-  'CITE',
-  'CODE',
-  'DATA',
-  'DFN',
-  'EM',
-  'I',
-  'KBD',
-  'MARK',
-  'Q',
-  'S',
-  'SAMP',
-  'SMALL',
-  'SPAN',
-  'STRONG',
-  'SUB',
-  'SUP',
-  'TIME',
-  'U',
-  'VAR',
-  'WBR',
-  'IMG',
-  'MAP',
-  'TRACK',
-  'VIDEO',
-  'EMBED',
-  'IFRAME',
-  'OBJECT',
-  'PICTURE',
-  'PORTAL',
-  'SOURCE',
-  'svg',
-  'math',
-  'CANVAS',
-  'DEL',
-  'INS',
-  'BUTTON',
-  'INPUT',
-  'LABEL',
-  'METER',
-  'OUTPUT',
-  'PROGRESS',
-  'SELECT',
-  'TEXTAREA',
-  'ACRONYM',
-  'BIG',
-  'CONTENT',
-  'FONT',
-  'IMG',
-  'MARQUEE',
-  'MENUITEM',
-  'NOBR',
-  'SHADOW',
-  'STRIKE',
-  'TT',
-]);
-
-/**
- * Actual <pre> elements are treated differently, so track these as a separate
- * rule.
- */
-type WhiteSpaceRule = 'normal' | 'actual-pre' | 'pre' | 'pre-line';
-
-type TrimStartRule = 'collapse' | 'all';
-type TrimEndRule = 'collapse' | 'single-newline';
-
-type CollapseWhiteSpaceState = {
-  inlineFormattingContext: null | {
-    atStart: boolean;
-    lastHasTrailingWhiteSpace: boolean;
-  };
-
-  whiteSpaceRule: WhiteSpaceRule;
-};
-
-// Entrypoint
-export const collapseWhiteSpace = (element: HTMLElement) => {
-  const clonedElement = element.cloneNode(true) as HTMLElement;
-
-  // Mutable state object
-  const state: CollapseWhiteSpaceState = {
-    inlineFormattingContext: null,
-    whiteSpaceRule: 'normal',
-  };
-
-  collapseWhiteSpaceElement(clonedElement, state);
-
-  return clonedElement;
-};
-
-// Recursive functions
-const collapseWhiteSpaceNode = (node: Node, state: CollapseWhiteSpaceState) => {
-  if (isHtmlElement(node)) {
-    collapseWhiteSpaceElement(node as HTMLElement, state);
-    return;
-  }
-
-  if (isHtmlText(node)) {
-    collapseWhiteSpaceText(node as Text, state);
-    return;
-  }
-
-  collapseWhiteSpaceChildren(node, state);
-};
-
-const collapseWhiteSpaceChildren = (
-  node: Node,
-  state: CollapseWhiteSpaceState
-) => {
-  const childNodes = Array.from(node.childNodes);
-
-  for (const childNode of childNodes) {
-    collapseWhiteSpaceNode(childNode, state);
-  }
-};
-
-const collapseWhiteSpaceElement = (
-  element: HTMLElement,
-  state: CollapseWhiteSpaceState
-) => {
-  const isInlineElement = isHtmlInlineElement(element);
-  const previousWhiteSpaceRule = state.whiteSpaceRule;
-  const inferredWhiteSpaceRule = inferWhiteSpaceRule(element);
-
-  if (inferredWhiteSpaceRule) {
-    state.whiteSpaceRule = inferredWhiteSpaceRule;
-  }
-
-  /**
-   * Note: We do not want to start an inline formatting context until we
-   * encounter a text node.
-   */
-
-  // End any existing inline formatting context
-  if (!isInlineElement) {
-    endInlineFormattingContext(state);
-  }
-
-  collapseWhiteSpaceChildren(element, state);
-
-  // Do not let inline formatting context break out of block elements
-  if (!isInlineElement) {
-    endInlineFormattingContext(state);
-  }
-
-  // Restore previous whiteSpaceRule
-  state.whiteSpaceRule = previousWhiteSpaceRule;
-};
-
-const collapseWhiteSpaceText = (text: Text, state: CollapseWhiteSpaceState) => {
-  const textContent = text.textContent || '';
-  const isWhiteSpaceOnly = textContent.trim() === '';
-
-  // Do not start an inline formatting context with a whiteSpace-only text node
-  if (state.inlineFormattingContext || !isWhiteSpaceOnly) {
-    upsertInlineFormattingContext(state);
-  }
-
-  const { whiteSpaceRule } = state;
-
-  /**
-   * Note: Due to the way HTML strings are parsed in htmlStringToDOMNode, up to
-   * one newline is already trimmed from the start of text nodes inside <pre>
-   * elements. If we do so again here, we may remove too many newlines. This
-   * only applies to actual <pre> elements, not elements with the white-space
-   * CSS property.
-   */
-  const trimStart: TrimStartRule = (() => {
-    if (whiteSpaceRule !== 'normal') return 'collapse';
-
-    if (
-      !state.inlineFormattingContext ||
-      state.inlineFormattingContext.atStart ||
-      state.inlineFormattingContext.lastHasTrailingWhiteSpace
-    )
-      return 'all';
-
-    return 'collapse';
-  })();
-
-  const trimEnd: TrimEndRule = (() => {
-    if (whiteSpaceRule === 'normal') return 'collapse';
-    if (isLastNonEmptyTextOfInlineFormattingBlock(text))
-      return 'single-newline';
-    return 'collapse';
-  })();
-
-  const shouldCollapseWhiteSpace: boolean = {
-    normal: true,
-    'actual-pre': false,
-    pre: false,
-    'pre-line': true,
-  }[whiteSpaceRule];
-
-  const whiteSpaceIncludesNewlines = whiteSpaceRule !== 'pre-line';
-
-  const collapsedTextContent = collapseString(textContent || '', {
-    trimStart,
-    trimEnd,
-    shouldCollapseWhiteSpace,
-    whiteSpaceIncludesNewlines,
-  });
-
-  if (state.inlineFormattingContext && shouldCollapseWhiteSpace) {
-    state.inlineFormattingContext.lastHasTrailingWhiteSpace =
-      collapsedTextContent.endsWith(' ');
-  }
-
-  text.textContent = collapsedTextContent;
-};
-
-// Utilities
-const collapseString = (
-  text: string,
-  {
-    trimStart = 'collapse',
-    trimEnd = 'collapse',
-    shouldCollapseWhiteSpace = true,
-    whiteSpaceIncludesNewlines = true,
-  }: {
-    trimStart?: TrimStartRule;
-    trimEnd?: TrimEndRule;
-    shouldCollapseWhiteSpace?: boolean;
-    whiteSpaceIncludesNewlines?: boolean;
-  } = {}
-) => {
-  if (trimStart === 'all') {
-    text = text.replace(/^\s+/, '');
-  }
-
-  if (trimEnd === 'single-newline') {
-    // Strip at most one newline from the end
-    text = text.replace(/\n$/, '');
-  }
-
-  if (shouldCollapseWhiteSpace) {
-    if (whiteSpaceIncludesNewlines) {
-      text = text.replaceAll(/\s+/g, ' ');
-    } else {
-      // Collapse horizontal whitespace
-      text = text.replaceAll(/[^\S\n\r]+/g, ' ');
-
-      /**
-       * Trim horizontal whitespace from the start and end of lines (behavior
-       * of pre-line).
-       */
-      text = text.replaceAll(/^[^\S\n\r]+/gm, '');
-      text = text.replaceAll(/[^\S\n\r]+$/gm, '');
-    }
-  }
-
-  return text;
-};
-
-const inferWhiteSpaceRule = (element: HTMLElement): WhiteSpaceRule | null => {
-  const whiteSpaceProperty = element.style.whiteSpace;
-
-  switch (whiteSpaceProperty) {
-    case 'normal':
-    case 'nowrap': {
-      return 'normal';
-    }
-    case 'pre':
-    case 'pre-wrap':
-    case 'break-spaces': {
-      return 'pre';
-    }
-    case 'pre-line': {
-      return 'pre-line';
-    }
-  }
-
-  if (element.tagName === 'PRE') {
-    return 'actual-pre';
-  }
-
-  if (whiteSpaceProperty === 'initial') {
-    return 'normal';
-  }
-
-  return null;
-};
-
-const isHtmlInlineElement = (node: Node): boolean => {
-  if (!isHtmlElement(node)) return false;
-  const element = node as HTMLElement;
-
-  const tagNameIsInline = inlineTags.has(element.tagName);
-
-  /**
-   * Valid display values include 'inline flow'. We only care about the first
-   * part.
-   */
-  const displayProperty = element.style.display.split(' ')[0];
-
-  if (displayProperty === '') {
-    return tagNameIsInline;
-  }
-
-  if (displayProperty.startsWith('inline')) {
-    return true;
-  }
-
-  if (displayProperty === 'inherit' && element.parentElement) {
-    return isHtmlInlineElement(element.parentElement);
-  }
-
-  /**
-   * Handle all special values manually, so that any unhandled values can be
-   * assumed to be block.
-   *
-   * Note: Ideally, content inside `display: none` elements should not be
-   * parsed. However, if such elements are parsed, it's best for their inline
-   * or block status to be left unchanged.
-   */
-  if (
-    ['initial', 'unset', 'revert', 'revert-layer', 'contents', 'none'].includes(
-      displayProperty
-    )
-  ) {
-    return tagNameIsInline;
-  }
-
-  return false;
-};
-
-const isHtmlBlockElement = (node: Node): boolean => {
-  if (!isHtmlElement(node)) return false;
-  const element = node as HTMLElement;
-  return !isHtmlInlineElement(element);
-};
-
-const isLastNonEmptyTextOfInlineFormattingBlock = (
-  initialText: Text
-): boolean => {
-  let currentNode: Node | null = initialText;
-
-  while (true) {
-    if (currentNode.nextSibling) {
-      currentNode = currentNode.nextSibling;
-    } else {
-      // If there is no next sibling, ascend to the parent node
-      currentNode = currentNode.parentElement;
-      // If the parent node is a block, we've reached the end
-      if (currentNode && isHtmlBlockElement(currentNode)) {
-        return true;
-      }
-      // Otherwise, continue to the next sibling of the parent node
-      currentNode = currentNode?.nextSibling || null;
-    }
-
-    // If there's no next node, we've reached the end
-    if (!currentNode) {
-      return true;
-    }
-
-    // If the next node is a block, we've reached the end
-    if (isHtmlBlockElement(currentNode)) {
-      return true;
-    }
-
-    // If the next node is a non-empty text node, we're not at the end
-    if ((currentNode.textContent || '').length > 0) {
-      return false;
-    }
-
-    // Otherwise, continue to the next node
-  }
-};
-
-// State transforms
-const upsertInlineFormattingContext = (state: CollapseWhiteSpaceState) => {
-  if (state.inlineFormattingContext) {
-    state.inlineFormattingContext.atStart = false;
-  } else {
-    state.inlineFormattingContext = {
-      atStart: true,
-      lastHasTrailingWhiteSpace: false,
-    };
-  }
-};
-
-const endInlineFormattingContext = (state: CollapseWhiteSpaceState) => {
-  state.inlineFormattingContext = null;
-};
diff --git a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts
index 85bece6048..b6c2006884 100644
--- a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts
+++ b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts
@@ -2,7 +2,7 @@ import { EDescendant, Value } from '@udecode/slate';
 
 import { PlateEditor } from '../../../types/PlateEditor';
 import { normalizeDescendantsToDocumentFragment } from '../../../utils/normalizeDescendantsToDocumentFragment';
-import { collapseWhiteSpace } from './collapseWhiteSpace';
+import { collapseWhiteSpace } from './collapse-white-space';
 import { deserializeHtmlElement } from './deserializeHtmlElement';
 import { htmlStringToDOMNode } from './htmlStringToDOMNode';
 
diff --git a/packages/core/src/plugins/html-deserializer/utils/index.ts b/packages/core/src/plugins/html-deserializer/utils/index.ts
index 804dec0cd1..1b8c9386e2 100644
--- a/packages/core/src/plugins/html-deserializer/utils/index.ts
+++ b/packages/core/src/plugins/html-deserializer/utils/index.ts
@@ -8,7 +8,6 @@ export * from './cleanHtmlEmptyElements';
 export * from './cleanHtmlFontElements';
 export * from './cleanHtmlLinkElements';
 export * from './cleanHtmlTextNodes';
-export * from './collapseWhiteSpace';
 export * from './copyBlockMarksToSpanChild';
 export * from './deserializeHtml';
 export * from './deserializeHtmlElement';
@@ -44,3 +43,4 @@ export * from './traverseHtmlElements';
 export * from './traverseHtmlNode';
 export * from './traverseHtmlTexts';
 export * from './unwrapHtmlElement';
+export * from './collapse-white-space/index';