From ac08f2efd6e541f3393a2c83cdd95f22df8965bf Mon Sep 17 00:00:00 2001 From: Felix Feng Date: Wed, 30 Oct 2024 19:14:19 +0800 Subject: [PATCH] fix --- .changeset/large-items-remember.md | 5 + .../react/copilot/utils/getNextWord.spec.ts | 108 ++++++++++++++++++ .../ai/src/react/copilot/utils/getNextWord.ts | 34 ++++-- 3 files changed, 139 insertions(+), 8 deletions(-) create mode 100644 .changeset/large-items-remember.md create mode 100644 packages/ai/src/react/copilot/utils/getNextWord.spec.ts diff --git a/.changeset/large-items-remember.md b/.changeset/large-items-remember.md new file mode 100644 index 0000000000..1161200d43 --- /dev/null +++ b/.changeset/large-items-remember.md @@ -0,0 +1,5 @@ +--- +'@udecode/plate-ai': patch +--- + +Copilot: fix `getNextWord` to handle mixed Chinese and English text. diff --git a/packages/ai/src/react/copilot/utils/getNextWord.spec.ts b/packages/ai/src/react/copilot/utils/getNextWord.spec.ts new file mode 100644 index 0000000000..6192c02108 --- /dev/null +++ b/packages/ai/src/react/copilot/utils/getNextWord.spec.ts @@ -0,0 +1,108 @@ +import { getNextWord } from './getNextWord'; + +describe('getNextWord', () => { + describe('English text', () => { + it('should get first word with no spaces', () => { + expect(getNextWord({ text: 'hello world' })).toEqual({ + firstWord: 'hello', + remainingText: ' world', + }); + }); + + it('should handle leading spaces', () => { + expect(getNextWord({ text: ' hello world' })).toEqual({ + firstWord: ' hello', + remainingText: ' world', + }); + }); + + it('should handle single word', () => { + expect(getNextWord({ text: 'hello' })).toEqual({ + firstWord: 'hello', + remainingText: '', + }); + }); + }); + + describe('CJK characters', () => { + it('should handle Chinese characters', () => { + expect(getNextWord({ text: '你好 世界' })).toEqual({ + firstWord: '你', + remainingText: '好 世界', + }); + }); + + it('should handle Chinese character followed by punctuation', () => { + expect(getNextWord({ 
text: '你。好 世界' })).toEqual({ + firstWord: '你。', + remainingText: '好 世界', + }); + }); + + it('should handle various CJK punctuation marks', () => { + expect(getNextWord({ text: '你、好 世界' })).toEqual({ + firstWord: '你、', + remainingText: '好 世界', + }); + + expect(getNextWord({ text: '你!世界' })).toEqual({ + firstWord: '你!', + remainingText: '世界', + }); + + expect(getNextWord({ text: '你?好' })).toEqual({ + firstWord: '你?', + remainingText: '好', + }); + + expect(getNextWord({ text: 'hello? world' })).toEqual({ + firstWord: 'hello?', + remainingText: ' world', + }); + }); + + it('should handle Japanese Hiragana', () => { + expect(getNextWord({ text: 'こんにちは 世界' })).toEqual({ + firstWord: 'こ', + remainingText: 'んにちは 世界', + }); + }); + + it('should handle Korean characters', () => { + expect(getNextWord({ text: '안녕하세요 세계' })).toEqual({ + firstWord: '안', + remainingText: '녕하세요 세계', + }); + }); + + it('should handle CJK with leading spaces', () => { + expect(getNextWord({ text: ' 你好 世界' })).toEqual({ + firstWord: ' 你', + remainingText: '好 世界', + }); + }); + }); + + describe('mixed content', () => { + it('should handle mix of English and CJK', () => { + expect(getNextWord({ text: 'hello 你好' })).toEqual({ + firstWord: 'hello', + remainingText: ' 你好', + }); + }); + + it('should handle English words directly adjacent to Chinese characters', () => { + expect(getNextWord({ text: 'React是nice框架' })).toEqual({ + firstWord: 'React', + remainingText: '是nice框架', + }); + }); + + it('should handle CJK followed by English', () => { + expect(getNextWord({ text: '你 hello' })).toEqual({ + firstWord: '你', + remainingText: ' hello', + }); + }); + }); +}); diff --git a/packages/ai/src/react/copilot/utils/getNextWord.ts b/packages/ai/src/react/copilot/utils/getNextWord.ts index f3cf35485d..610369b293 100644 --- a/packages/ai/src/react/copilot/utils/getNextWord.ts +++ b/packages/ai/src/react/copilot/utils/getNextWord.ts @@ -30,18 +30,36 @@ export const getNextWord: GetNextWord = ({ text }) => { let 
firstWord, remainingText; if (isCJKChar) { - // CJK characters: match leading spaces + first character + trailing spaces + // CJK characters: match leading spaces + first character + optional punctuation const match = - // eslint-disable-next-line regexp/no-unused-capturing-group - /^(\s*[\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]\s*)/.exec( + /^(\s*)([\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF])([\u3000-\u303F\uFF00-\uFFEF])?/.exec( text ); - firstWord = match?.[0] || ''; - remainingText = text.slice(firstWord.length); + + if (match) { + // eslint-disable-next-line @typescript-eslint/no-unused-vars + const [fullMatch, spaces = '', char = '', punctuation = ''] = match; + firstWord = spaces + char + punctuation; + remainingText = text.slice(firstWord.length); + } else { + firstWord = ''; + remainingText = text; + } } else { - // Other characters (e.g., English): use space-based word separation - firstWord = /^\s*\S+/.exec(text)?.[0] || ''; - remainingText = text.slice(firstWord.length); + // For non-CJK text (including mixed content), match until space or CJK char + const match = + // eslint-disable-next-line regexp/no-unused-capturing-group + /^(\s*\S+?)(?=[\s\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]|$)/.exec( + text + ); + + if (match) { + firstWord = match[0]; + remainingText = text.slice(firstWord.length); + } else { + firstWord = text; + remainingText = ''; + } } return { firstWord, remainingText };