Skip to content

Commit

Permalink
Merge pull request #3690 from udecode/fixai
Browse files Browse the repository at this point in the history
Fix the copilot issues with mixed Chinese and English text.
  • Loading branch information
felixfeng33 authored Oct 30, 2024
2 parents e17e584 + ac08f2e commit 61d9e93
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 8 deletions.
5 changes: 5 additions & 0 deletions .changeset/large-items-remember.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@udecode/plate-ai': patch
---

Copilot: fix `getNextWord` to correctly handle text that mixes Chinese and English.
108 changes: 108 additions & 0 deletions packages/ai/src/react/copilot/utils/getNextWord.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import { getNextWord } from './getNextWord';

/**
 * Specification for `getNextWord`, which splits `text` into the next chunk to
 * accept (`firstWord`) and everything after it (`remainingText`).
 *
 * The expectations below pin three behaviours:
 * - space-delimited text yields one whole word, leading whitespace included;
 * - text starting with a CJK character yields that single character, plus one
 *   directly following CJK / fullwidth punctuation mark if present;
 * - in mixed text, a Latin word ends where a CJK character begins.
 */
describe('getNextWord', () => {
  describe('English text', () => {
    it('should get first word with no spaces', () => {
      expect(getNextWord({ text: 'hello world' })).toEqual({
        firstWord: 'hello',
        remainingText: ' world',
      });
    });

    it('should handle leading spaces', () => {
      // Leading whitespace stays attached to the word it precedes.
      expect(getNextWord({ text: ' hello world' })).toEqual({
        firstWord: ' hello',
        remainingText: ' world',
      });
    });

    it('should handle single word', () => {
      expect(getNextWord({ text: 'hello' })).toEqual({
        firstWord: 'hello',
        remainingText: '',
      });
    });
  });

  describe('CJK characters', () => {
    it('should handle Chinese characters', () => {
      // CJK text has no word spacing, so a single character counts as a word.
      expect(getNextWord({ text: '你好 世界' })).toEqual({
        firstWord: '你',
        remainingText: '好 世界',
      });
    });

    it('should handle Chinese character followed by punctuation', () => {
      // Ideographic full stop (U+3002) travels with the preceding character.
      expect(getNextWord({ text: '你。好 世界' })).toEqual({
        firstWord: '你。',
        remainingText: '好 世界',
      });
    });

    it('should handle various CJK punctuation marks', () => {
      // Ideographic comma (U+3001).
      expect(getNextWord({ text: '你、好 世界' })).toEqual({
        firstWord: '你、',
        remainingText: '好 世界',
      });

      // Fullwidth exclamation mark (U+FF01). Written as an escape so the
      // character cannot be silently folded to ASCII '!', which is outside the
      // punctuation ranges (U+3000-U+303F, U+FF00-U+FFEF) the CJK path accepts.
      expect(getNextWord({ text: '你\uFF01世界' })).toEqual({
        firstWord: '你\uFF01',
        remainingText: '世界',
      });

      // Fullwidth question mark (U+FF1F), escaped for the same reason.
      expect(getNextWord({ text: '你\uFF1F好' })).toEqual({
        firstWord: '你\uFF1F',
        remainingText: '好',
      });

      // ASCII punctuation glued to a Latin word is part of that word.
      expect(getNextWord({ text: 'hello? world' })).toEqual({
        firstWord: 'hello?',
        remainingText: ' world',
      });
    });

    it('should handle Japanese Hiragana', () => {
      expect(getNextWord({ text: 'こんにちは 世界' })).toEqual({
        firstWord: 'こ',
        remainingText: 'んにちは 世界',
      });
    });

    it('should handle Korean characters', () => {
      expect(getNextWord({ text: '안녕하세요 세계' })).toEqual({
        firstWord: '안',
        remainingText: '녕하세요 세계',
      });
    });

    it('should handle CJK with leading spaces', () => {
      expect(getNextWord({ text: ' 你好 世界' })).toEqual({
        firstWord: ' 你',
        remainingText: '好 世界',
      });
    });
  });

  describe('mixed content', () => {
    it('should handle mix of English and CJK', () => {
      expect(getNextWord({ text: 'hello 你好' })).toEqual({
        firstWord: 'hello',
        remainingText: ' 你好',
      });
    });

    it('should handle English words directly adjacent to Chinese characters', () => {
      // No whitespace separates the scripts: the Latin word must stop at the
      // first CJK character rather than swallowing the rest of the string.
      expect(getNextWord({ text: 'React是nice框架' })).toEqual({
        firstWord: 'React',
        remainingText: '是nice框架',
      });
    });

    it('should handle CJK followed by English', () => {
      // An ASCII space is not CJK punctuation, so it is left in the remainder.
      expect(getNextWord({ text: '你 hello' })).toEqual({
        firstWord: '你',
        remainingText: ' hello',
      });
    });
  });
});
34 changes: 26 additions & 8 deletions packages/ai/src/react/copilot/utils/getNextWord.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,36 @@ export const getNextWord: GetNextWord = ({ text }) => {
let firstWord, remainingText;

if (isCJKChar) {
// CJK characters: match leading spaces + first character + trailing spaces
// CJK characters: match leading spaces + first character + optional punctuation
const match =
// eslint-disable-next-line regexp/no-unused-capturing-group
/^(\s*[\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]\s*)/.exec(
/^(\s*)([\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF])([\u3000-\u303F\uFF00-\uFFEF])?/.exec(
text
);
firstWord = match?.[0] || '';
remainingText = text.slice(firstWord.length);

if (match) {
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const [fullMatch, spaces = '', char = '', punctuation = ''] = match;
firstWord = spaces + char + punctuation;
remainingText = text.slice(firstWord.length);
} else {
firstWord = '';
remainingText = text;
}
} else {
// Other characters (e.g., English): use space-based word separation
firstWord = /^\s*\S+/.exec(text)?.[0] || '';
remainingText = text.slice(firstWord.length);
// For non-CJK text (including mixed content), match until space or CJK char
const match =
// eslint-disable-next-line regexp/no-unused-capturing-group
/^(\s*\S+?)(?=[\s\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]|$)/.exec(
text
);

if (match) {
firstWord = match[0];
remainingText = text.slice(firstWord.length);
} else {
firstWord = text;
remainingText = '';
}
}

return { firstWord, remainingText };
Expand Down

0 comments on commit 61d9e93

Please sign in to comment.