Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the copilot issues with mixed Chinese and English text. #3690

Merged
merged 1 commit into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/large-items-remember.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@udecode/plate-ai': patch
---

Copilot: fix `getNextWord` to correctly handle text that mixes Chinese and English.
108 changes: 108 additions & 0 deletions packages/ai/src/react/copilot/utils/getNextWord.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import { getNextWord } from './getNextWord';

/**
 * Specification for `getNextWord`, which splits `text` into the next chunk to
 * accept (`firstWord`) and everything after it (`remainingText`).
 *
 * Behaviour pinned by these cases:
 * - Space-separated (e.g. English) text yields one whole word, including any
 *   leading whitespace.
 * - CJK text yields a single character, plus one directly following CJK or
 *   fullwidth punctuation mark.
 * - In mixed text, a non-CJK word ends at whitespace or at the first CJK
 *   character.
 */
describe('getNextWord', () => {
  describe('English text', () => {
    it('should get first word with no spaces', () => {
      expect(getNextWord({ text: 'hello world' })).toEqual({
        firstWord: 'hello',
        remainingText: ' world',
      });
    });

    it('should handle leading spaces', () => {
      // Leading whitespace stays attached to the word it precedes.
      expect(getNextWord({ text: ' hello world' })).toEqual({
        firstWord: ' hello',
        remainingText: ' world',
      });
    });

    it('should handle single word', () => {
      expect(getNextWord({ text: 'hello' })).toEqual({
        firstWord: 'hello',
        remainingText: '',
      });
    });
  });

  describe('CJK characters', () => {
    it('should handle Chinese characters', () => {
      // CJK text has no word separators, so one character is one "word".
      expect(getNextWord({ text: '你好 世界' })).toEqual({
        firstWord: '你',
        remainingText: '好 世界',
      });
    });

    it('should handle Chinese character followed by punctuation', () => {
      expect(getNextWord({ text: '你。好 世界' })).toEqual({
        firstWord: '你。',
        remainingText: '好 世界',
      });
    });

    it('should handle various CJK punctuation marks', () => {
      expect(getNextWord({ text: '你、好 世界' })).toEqual({
        firstWord: '你、',
        remainingText: '好 世界',
      });

      // U+FF01 FULLWIDTH EXCLAMATION MARK. Written as an escape on purpose:
      // the CJK branch only attaches punctuation from U+3000-U+303F and
      // U+FF00-U+FFEF, so an ASCII '!' here would not be part of firstWord.
      expect(getNextWord({ text: '你\uFF01世界' })).toEqual({
        firstWord: '你\uFF01',
        remainingText: '世界',
      });

      // U+FF1F FULLWIDTH QUESTION MARK (same reasoning as above).
      expect(getNextWord({ text: '你\uFF1F好' })).toEqual({
        firstWord: '你\uFF1F',
        remainingText: '好',
      });

      // ASCII punctuation after an English word is not a separator, so it
      // stays attached to that word.
      expect(getNextWord({ text: 'hello? world' })).toEqual({
        firstWord: 'hello?',
        remainingText: ' world',
      });
    });

    it('should handle Japanese Hiragana', () => {
      expect(getNextWord({ text: 'こんにちは 世界' })).toEqual({
        firstWord: 'こ',
        remainingText: 'んにちは 世界',
      });
    });

    it('should handle Korean characters', () => {
      expect(getNextWord({ text: '안녕하세요 세계' })).toEqual({
        firstWord: '안',
        remainingText: '녕하세요 세계',
      });
    });

    it('should handle CJK with leading spaces', () => {
      expect(getNextWord({ text: ' 你好 世界' })).toEqual({
        firstWord: ' 你',
        remainingText: '好 世界',
      });
    });
  });

  describe('mixed content', () => {
    it('should handle mix of English and CJK', () => {
      expect(getNextWord({ text: 'hello 你好' })).toEqual({
        firstWord: 'hello',
        remainingText: ' 你好',
      });
    });

    it('should handle English words directly adjacent to Chinese characters', () => {
      // With no whitespace, the English word must still end at the first CJK
      // character rather than swallowing the rest of the run.
      expect(getNextWord({ text: 'React是nice框架' })).toEqual({
        firstWord: 'React',
        remainingText: '是nice框架',
      });
    });

    it('should handle CJK followed by English', () => {
      // A space is not punctuation, so it is left in remainingText.
      expect(getNextWord({ text: '你 hello' })).toEqual({
        firstWord: '你',
        remainingText: ' hello',
      });
    });
  });
});
34 changes: 26 additions & 8 deletions packages/ai/src/react/copilot/utils/getNextWord.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,36 @@ export const getNextWord: GetNextWord = ({ text }) => {
let firstWord, remainingText;

if (isCJKChar) {
// CJK characters: match leading spaces + first character + trailing spaces
// CJK characters: match leading spaces + first character + optional punctuation
const match =
// eslint-disable-next-line regexp/no-unused-capturing-group
/^(\s*[\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]\s*)/.exec(
/^(\s*)([\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF])([\u3000-\u303F\uFF00-\uFFEF])?/.exec(
text
);
firstWord = match?.[0] || '';
remainingText = text.slice(firstWord.length);

if (match) {
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const [fullMatch, spaces = '', char = '', punctuation = ''] = match;
firstWord = spaces + char + punctuation;
remainingText = text.slice(firstWord.length);
} else {
firstWord = '';
remainingText = text;
}
} else {
// Other characters (e.g., English): use space-based word separation
firstWord = /^\s*\S+/.exec(text)?.[0] || '';
remainingText = text.slice(firstWord.length);
// For non-CJK text (including mixed content), match until space or CJK char
const match =
// eslint-disable-next-line regexp/no-unused-capturing-group
/^(\s*\S+?)(?=[\s\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]|$)/.exec(
text
);

if (match) {
firstWord = match[0];
remainingText = text.slice(firstWord.length);
} else {
firstWord = text;
remainingText = '';
}
}

return { firstWord, remainingText };
Expand Down