diff --git a/.changeset/flat-games-fetch.md b/.changeset/flat-games-fetch.md new file mode 100644 index 0000000000..f3678044bb --- /dev/null +++ b/.changeset/flat-games-fetch.md @@ -0,0 +1,5 @@ +--- +'@udecode/plate-core': major +--- + +Resolve the issue of whitespace in pasted HTML not being handled according to the HTML specification. diff --git a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.spec.tsx b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.spec.tsx index fa5976ed03..8f7256d970 100644 --- a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.spec.tsx +++ b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.spec.tsx @@ -232,3 +232,72 @@ describe('when deserializing all plugins', () => { ).toEqual(output.children); }); }); + +describe('when stripWhitespace is true', () => { + // https://github.com/udecode/plate/issues/2713#issuecomment-1780118687 + const html = `

\n Hello world\n

\n\n

\n one two \n three\n

\n\n
\nhello     one two\nthree\nfour\n
\n\n
\nhello one two\nthree\nfour\n
\n\n
\nhello one two\nthree\nfour\n
`; + const element = getHtmlDocument(html).body; + + const expectedOutput = [ + { + text: 'Hello world', + }, + { + text: 'one two three', + }, + { + text: 'hello one two\nthree\nfour', + }, + { + text: '\nhello one two\nthree\nfour', + }, + { + text: '\nhello one two\nthree\nfour', + }, + ]; + + it('should strip Whitespace by style', () => { + const convertedDocumentFragment = deserializeHtml(createPlateEditor(), { + element, + stripWhitespace: true, + }); + + expect(convertedDocumentFragment).toEqual(expectedOutput); + }); + + it('should strip Whitespace Normal start', () => { + const convertedDocumentFragment = deserializeHtml(createPlateEditor(), { + element: getHtmlDocument('

Hello world

').body, + stripWhitespace: true, + }); + + expect(convertedDocumentFragment).toEqual([ + { text: 'Hello ' }, + { text: 'world' }, + ]); + }); + + it('should strip Whitespace Normal end', () => { + const convertedDocumentFragment = deserializeHtml(createPlateEditor(), { + element: getHtmlDocument('

Hello world

').body, + stripWhitespace: true, + }); + + expect(convertedDocumentFragment).toEqual([ + { text: 'Hello ' }, + { text: 'world' }, + ]); + }); + + it('should strip Whitespace by
', () => {
+    const convertedDocumentFragment = deserializeHtml(createPlateEditor(), {
+      element: getHtmlDocument('
\nhello     one two\nthree\nfour\n
') + .body, + stripWhitespace: true, + }); + + expect(convertedDocumentFragment).toEqual([ + { text: 'hello one two\nthree\nfour' }, + ]); + }); +}); diff --git a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts index 20e9c0b4cc..96717d2fc4 100644 --- a/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts +++ b/packages/core/src/plugins/html-deserializer/utils/deserializeHtml.ts @@ -4,6 +4,7 @@ import { PlateEditor } from '../../../types/PlateEditor'; import { normalizeDescendantsToDocumentFragment } from '../../../utils/normalizeDescendantsToDocumentFragment'; import { deserializeHtmlElement } from './deserializeHtmlElement'; import { htmlStringToDOMNode } from './htmlStringToDOMNode'; +import { stripWhitespace as stripWhitespaceFunction } from './stripWhitespace'; /** * Deserialize HTML element to a valid document fragment. @@ -20,7 +21,12 @@ export const deserializeHtml = ( ): EDescendant[] => { // for serializer if (typeof element === 'string') { - element = htmlStringToDOMNode(element, stripWhitespace); + element = htmlStringToDOMNode(element); + } + + if (stripWhitespace) { + // TODO FIXME + stripWhitespaceFunction(element); } const fragment = deserializeHtmlElement(editor, element) as EDescendant[]; diff --git a/packages/core/src/plugins/html-deserializer/utils/htmlBodyToFragment.ts b/packages/core/src/plugins/html-deserializer/utils/htmlBodyToFragment.ts index 8a6ee98821..0bbdab4f80 100644 --- a/packages/core/src/plugins/html-deserializer/utils/htmlBodyToFragment.ts +++ b/packages/core/src/plugins/html-deserializer/utils/htmlBodyToFragment.ts @@ -11,13 +11,14 @@ jsx; */ export const htmlBodyToFragment = ( editor: PlateEditor, - element: HTMLElement + element: HTMLElement, + stripWhitespace = true ): EDescendant[] | undefined => { if (element.nodeName === 'BODY') { return jsx( 'fragment', {}, - deserializeHtmlNodeChildren(editor, element) + deserializeHtmlNodeChildren(editor, element, stripWhitespace) ) as EDescendant[]; } }; diff --git a/packages/core/src/plugins/html-deserializer/utils/htmlElementToElement.ts b/packages/core/src/plugins/html-deserializer/utils/htmlElementToElement.ts index f5848c585c..571df45465 100644 --- a/packages/core/src/plugins/html-deserializer/utils/htmlElementToElement.ts +++ b/packages/core/src/plugins/html-deserializer/utils/htmlElementToElement.ts @@ -10,7 +10,8 @@ import { pipeDeserializeHtmlElement } from './pipeDeserializeHtmlElement'; */ export const htmlElementToElement = ( editor: PlateEditor, - element: HTMLElement + element: HTMLElement, + stripWhitespace = true ) => { const deserialized = pipeDeserializeHtmlElement(editor, element); @@ -19,7 +20,11 @@ export const htmlElementToElement = ( let descendants = node.children ?? - (deserializeHtmlNodeChildren(editor, element) as TDescendant[]); + (deserializeHtmlNodeChildren( + editor, + element, + stripWhitespace + ) as TDescendant[]); if (descendants.length === 0 || withoutChildren) { descendants = [{ text: '' }]; } diff --git a/packages/core/src/plugins/html-deserializer/utils/htmlStringToDOMNode.ts b/packages/core/src/plugins/html-deserializer/utils/htmlStringToDOMNode.ts index bb386b5b04..bb7fdc498c 100644 --- a/packages/core/src/plugins/html-deserializer/utils/htmlStringToDOMNode.ts +++ b/packages/core/src/plugins/html-deserializer/utils/htmlStringToDOMNode.ts @@ -1,16 +1,9 @@ /** * Convert HTML string into HTML element. */ -export const htmlStringToDOMNode = ( - rawHtml: string, - stripWhitespace = true -) => { +export const htmlStringToDOMNode = (rawHtml: string) => { const node = document.createElement('body'); node.innerHTML = rawHtml; - if (stripWhitespace) { - node.innerHTML = node.innerHTML.replaceAll(/(\r\n|[\t\n\r])/g, ''); - } - return node; }; diff --git a/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.spec.ts b/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.spec.ts index 72eb29ec71..31bfc36a59 100644 --- a/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.spec.ts +++ b/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.spec.ts @@ -22,7 +22,7 @@ describe('htmlTextNodeToString', () => { describe('when text node with no characters except \n', () => { it('should be null', () => { const input = document.createTextNode('\n\n\n\n\n'); - const output = null; + const output = '\n\n\n\n\n'; expect(htmlTextNodeToString(input)).toEqual(output); }); @@ -31,7 +31,7 @@ describe('htmlTextNodeToString', () => { describe('when text node with text and \n characters', () => { it('should strip \n characters from start and end', () => { const input = document.createTextNode('\n\n\ntest\n\ntest\n\n'); - const output = 'test\n\ntest'; + const output = '\n\n\ntest\n\ntest\n\n'; expect(htmlTextNodeToString(input)).toEqual(output); }); diff --git a/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.ts b/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.ts index 7ab810a2f3..0123d8063d 100644 --- a/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.ts +++ b/packages/core/src/plugins/html-deserializer/utils/htmlTextNodeToString.ts @@ -5,7 +5,7 @@ import { isHtmlText } from './isHtmlText'; export const htmlTextNodeToString = (node: HTMLElement | ChildNode) => { if (isHtmlText(node)) { - const trimmedText = node.textContent?.replace(/^\n+|\n+$/g, '') ?? ''; + const trimmedText = node.textContent ?? ''; return trimmedText.length > 0 ? trimmedText : null; } }; diff --git a/packages/core/src/plugins/html-deserializer/utils/stripWhitespace.ts b/packages/core/src/plugins/html-deserializer/utils/stripWhitespace.ts new file mode 100644 index 0000000000..4a403b4474 --- /dev/null +++ b/packages/core/src/plugins/html-deserializer/utils/stripWhitespace.ts @@ -0,0 +1,265 @@ +import { isHtmlText } from './isHtmlText'; + +const allWhiteSpaceRegExp = /[\t\n\r ]+/g; +const startWhiteSpaceRegExp = /^[\t\n\r ]+/; +const endWhiteSpaceRegExp = /[\t\n\r ]+$/; + +// https://github.com/udecode/plate/pull/2718#discussion_r1375418430 +// Strip exactly one \n from the start of the text node +const oneLineFromStartRegExp = /^\n(?!\n)/g; +// Strip exactly one \n from the end of the text node +const oneLineFromEndRegExp = /(? node.nodeName?.toLowerCase(); + +const setNodeText = ( + node: Node, + callback: ((val: string) => string) | string +) => { + if (node.textContent) { + node.textContent = + typeof callback === 'function' ? callback(node.textContent) : callback; + } +}; + +const isPreElements = (node: Node) => preElements.has(getNodeName(node)); + +const isTextInLineElements = (node: Node) => + textInLineElements.has(getNodeName(node)); + +const walk = (node: Node) => { + if (node.firstChild) { + return node.firstChild; + } + + let nodeNext = node.nextSibling; + if (nodeNext) { + return nodeNext; + } + + for ( + let parent = node.parentNode; + Boolean(parent); + parent = parent?.parentNode || null + ) { + nodeNext = parent?.nextSibling || null; + + if (nodeNext) { + return nodeNext; + } + } +}; + +const getWhitespaceParent = (node: HTMLElement): [boolean, string, string] => { + let tempNode = node.parentNode; + while (tempNode != null) { + const nodeName = tempNode.nodeName?.toLowerCase(); + + if (tempNode.nodeType === Node.ELEMENT_NODE) { + const style = (tempNode as HTMLElement).style; + + if (style.whiteSpace && style.whiteSpace !== 'inherit') { + return [true, style.whiteSpace, nodeName]; + } + } + + if (isPreElements(tempNode)) { + return [true, '', nodeName]; + } else { + tempNode = tempNode.parentNode; + } + } + return [false, '', '']; +}; + +function nextSiblingWithSpace(node: Node) { + if (node.parentNode == null) { + return true; + } + + if (!textInLineElements.has(getNodeName(node.parentNode))) { + return true; + } + + if (node.parentNode.nextSibling == null) { + return true; + } + + if (node.parentNode.nextSibling.nodeType === Node.TEXT_NODE) { + return false; + } + + if (isTextInLineElements(node.parentNode.nextSibling)) { + return false; + } + + return true; +} + +function stripWhitespaceNormal( + node: HTMLElement, + preNodeEndsWithSpace: boolean +) { + let text = node.textContent ?? ''; + text = text.replaceAll(allWhiteSpaceRegExp, ' '); + + const shouldLeftTrim = + !node.previousSibling || + (node.previousSibling.nodeType === Node.ELEMENT_NODE && + node.previousSibling.nodeName === 'BR') || + preNodeEndsWithSpace; + + const shouldRightTrim = node.nextSibling ? false : nextSiblingWithSpace(node); + + if (shouldLeftTrim) { + text = text.replace(startWhiteSpaceRegExp, ''); + } + + if (shouldRightTrim) { + text = text.replace(endWhiteSpaceRegExp, ''); + } + + setNodeText(node, text); +} + +function stripWhitespaceByStyle( + node: HTMLElement, + whiteSpaceStyle: string, + preNodeEndsWithSpace: boolean +) { + switch (whiteSpaceStyle) { + case 'unset': + case 'initial': // Browser's default styles. + case 'normal': + case 'nowrap': { + stripWhitespaceNormal(node, preNodeEndsWithSpace); + break; + } + // For white-space: pre or pre-line: + // Do not strip any \n from the start of the text node + // Strip exactly one \n from the end of the text node + case 'pre-line': { + setNodeText(node, (val) => + val.replaceAll(/[\t ]+/g, ' ').replaceAll(oneLineFromEndRegExp, '') + ); + break; + } + case 'pre': { + setNodeText(node, (val) => val.replaceAll(oneLineFromEndRegExp, '')); + break; + } + // "revert" and "revert-layer" are expected to be supported in the future. + // eslint-disable-next-line unicorn/no-useless-switch-case + case 'break-spaces': + // eslint-disable-next-line unicorn/no-useless-switch-case + case 'pre-wrap': + // eslint-disable-next-line unicorn/no-useless-switch-case + case 'revert': + // eslint-disable-next-line unicorn/no-useless-switch-case + case 'revert-layer': + default: { + break; + } + } +} + +function stripWhitespaceByNodeName(node: HTMLElement, nodeName: string) { + switch (nodeName) { + case 'pre': { + setNodeText(node, (val) => + val + .replaceAll(oneLineFromStartRegExp, '') + .replaceAll(oneLineFromEndRegExp, '') + ); + break; + } + default: { + break; + } + } +} + +// TODO: FXI packages\serializer-docx\src\deserializer\__tests__ +// There are issues when importing DOCX documents. +// Handling DOCX files is completely different from handling web pages +// Handling of display: inline and inline-block is pending +export function stripWhitespace(root: HTMLElement) { + let nodeEndsWithSpace: boolean = false; + let preNodeEndsWithSpace: boolean = false; + + const preprocess = (node: HTMLElement | null) => { + if (node == null) { + return; + } + + if (isHtmlText(node)) { + nodeEndsWithSpace = + Boolean(node.textContent) && + /[^\S\u00A0]/.test( + node.textContent?.charAt(node.textContent?.length - 1) || '' + ); + + const [isWhiteSpaceParent, whitespaceStyle, nodeName] = + getWhitespaceParent(node); + + if (isWhiteSpaceParent) { + whitespaceStyle && + stripWhitespaceByStyle(node, whitespaceStyle, preNodeEndsWithSpace); + stripWhitespaceByNodeName(node, nodeName); + } else { + stripWhitespaceNormal(node, preNodeEndsWithSpace); + } + + preNodeEndsWithSpace = nodeEndsWithSpace; + } else { + nodeEndsWithSpace = false; + } + }; + + for ( + let node = root, lastNode = node; + node; + lastNode = node, node = walk(node) as HTMLElement + ) { + const tempNode = node; + + if (tempNode.parentNode == null && tempNode !== root) { + node = lastNode; + } + + preprocess(tempNode); + } +}