Skip to content

Commit d05259d

Browse files
committed
perf: retry to load image;perf: default index check (#4004)
* perf: retry to load image * perf: default index check
1 parent 8980664 commit d05259d

File tree

9 files changed

+111
-80
lines changed

9 files changed

+111
-80
lines changed

docSite/content/zh-cn/docs/development/openapi/dataset.md

+14-7
Original file line numberDiff line numberDiff line change
@@ -1063,10 +1063,12 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/collect
10631063

10641064
| 字段 | 类型 | 说明 | 必填 |
10651065
| --- | --- | --- | --- |
1066-
| defaultIndex | Boolean | 是否为默认索引 | |
1067-
| dataId | String | 关联的向量ID | |
1066+
| type | String | 可选索引类型:default-默认索引; custom-自定义索引; summary-总结索引; question-问题索引; image-图片索引 | |
1067+
| dataId | String | 关联的向量 ID,变更数据时传入该 ID,会进行差量更新,而不是全量更新 | |
10681068
| text | String | 文本内容 ||
10691069

1070+
`type` 不填则默认为 `custom` 索引,还会基于 q/a 组成一个默认索引。如果传入了默认索引,则不会额外创建。
1071+
10701072
### 为集合批量添加数据
10711073

10721074
注意,每次最多推送 200 组数据。
@@ -1298,8 +1300,7 @@ curl --location --request GET 'http://localhost:3000/api/core/dataset/data/detai
12981300
"chunkIndex": 0,
12991301
"indexes": [
13001302
{
1301-
"defaultIndex": true,
1302-
"type": "chunk",
1303+
"type": "default",
13031304
"dataId": "3720083",
13041305
"text": "N o . 2 0 2 2 1 2中 国 信 息 通 信 研 究 院京东探索研究院2022年 9月人工智能生成内容(AIGC)白皮书(2022 年)版权声明本白皮书版权属于中国信息通信研究院和京东探索研究院,并受法律保护。转载、摘编或利用其它方式使用本白皮书文字或者观点的,应注明“来源:中国信息通信研究院和京东探索研究院”。违反上述声明者,编者将追究其相关法律责任。前 言习近平总书记曾指出,“数字技术正以新理念、新业态、新模式全面融入人类经济、政治、文化、社会、生态文明建设各领域和全过程”。在当前数字世界和物理世界加速融合的大背景下,人工智能生成内容(Artificial Intelligence Generated Content,简称 AIGC)正在悄然引导着一场深刻的变革,重塑甚至颠覆数字内容的生产方式和消费模式,将极大地丰富人们的数字生活,是未来全面迈向数字文明新时代不可或缺的支撑力量。",
13051306
"_id": "65abd4b29d1448617cba61dc"
@@ -1334,13 +1335,19 @@ curl --location --request PUT 'http://localhost:3000/api/core/dataset/data/updat
13341335
"q":"测试111",
13351336
"a":"sss",
13361337
"indexes":[
1338+
{
1339+
"dataId": "xxxx",
1340+
"type": "default",
1341+
"text": "默认索引"
1342+
},
13371343
{
13381344
"dataId": "xxx",
1339-
"defaultIndex":false,
1340-
"text":"自定义索引1"
1345+
"type": "custom",
1346+
"text": "旧的自定义索引1"
13411347
},
13421348
{
1343-
"text":"修改后的自定义索引2。(会删除原来的自定义索引2,并插入新的自定义索引2)"
1349+
"type":"custom",
1350+
"text":"新增的自定义索引"
13441351
}
13451352
]
13461353
}'

packages/global/common/string/markdown.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ export const markdownProcess = async ({
168168
return simpleMarkdownText(imageProcess);
169169
};
170170

171-
export const matchMdImgTextAndUpload = (text: string) => {
171+
export const matchMdImg = (text: string) => {
172172
const base64Regex = /!\[([^\]]*)\]\((data:image\/[^;]+;base64[^)]+)\)/g;
173173
const imageList: ImageType[] = [];
174174

packages/service/common/file/image/controller.ts

+10-7
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { guessBase64ImageType } from '../utils';
66
import { readFromSecondary } from '../../mongo/utils';
77
import { addHours } from 'date-fns';
88
import { imageFileType } from '@fastgpt/global/common/file/constants';
9+
import { retryFn } from '@fastgpt/global/common/system/utils';
910

1011
export const maxImgSize = 1024 * 1024 * 12;
1112
const base64MimeRegex = /data:image\/([^\)]+);base64/;
@@ -40,13 +41,15 @@ export async function uploadMongoImg({
4041
return Promise.reject(`Invalid image file type: ${mime}`);
4142
}
4243

43-
const { _id } = await MongoImage.create({
44-
teamId,
45-
binary,
46-
metadata: Object.assign({ mime }, metadata),
47-
shareId,
48-
expiredTime: forever ? undefined : addHours(new Date(), 1)
49-
});
44+
const { _id } = await retryFn(() =>
45+
MongoImage.create({
46+
teamId,
47+
binary,
48+
metadata: Object.assign({ mime }, metadata),
49+
shareId,
50+
expiredTime: forever ? undefined : addHours(new Date(), 1)
51+
})
52+
);
5053

5154
return `${process.env.NEXT_PUBLIC_BASE_URL || ''}${imageBaseUrl}${String(_id)}.${extension}`;
5255
}

packages/service/common/file/image/utils.ts

+13-6
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,30 @@ import axios from 'axios';
22
import { addLog } from '../../system/log';
33
import { serverRequestBaseUrl } from '../../api/serverRequest';
44
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils';
5+
import { retryFn } from '@fastgpt/global/common/system/utils';
56

67
export const getImageBase64 = async (url: string) => {
78
addLog.debug(`Load image to base64: ${url}`);
89

910
try {
10-
const response = await axios.get(url, {
11-
baseURL: serverRequestBaseUrl,
12-
responseType: 'arraybuffer',
13-
proxy: false
14-
});
11+
const response = await retryFn(() =>
12+
axios.get(url, {
13+
baseURL: serverRequestBaseUrl,
14+
responseType: 'arraybuffer',
15+
proxy: false
16+
})
17+
);
1518

1619
const base64 = Buffer.from(response.data, 'binary').toString('base64');
1720
const imageType =
1821
getFileContentTypeFromHeader(response.headers['content-type']) ||
1922
guessBase64ImageType(base64);
2023

21-
return `data:${imageType};base64,${base64}`;
24+
return {
25+
completeBase64: `data:${imageType};base64,${base64}`,
26+
base64,
27+
mime: imageType
28+
};
2229
} catch (error) {
2330
addLog.debug(`Load image to base64 failed: ${url}`);
2431
console.log(error);

packages/service/common/file/read/utils.ts

+15-12
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@ import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type'
66
import axios from 'axios';
77
import { addLog } from '../../system/log';
88
import { batchRun } from '@fastgpt/global/common/system/utils';
9-
import { htmlTable2Md, matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
9+
import { htmlTable2Md, matchMdImg } from '@fastgpt/global/common/string/markdown';
1010
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
1111
import { getErrText } from '@fastgpt/global/common/error/utils';
1212
import { delay } from '@fastgpt/global/common/system/utils';
1313
import { getNanoid } from '@fastgpt/global/common/string/tools';
14+
import { getImageBase64 } from '../image/utils';
1415

1516
export type readRawTextByLocalFileParams = {
1617
teamId: string;
@@ -99,7 +100,7 @@ export const readRawContentByFileBuffer = async ({
99100
addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);
100101

101102
const rawText = response.markdown;
102-
const { text, imageList } = matchMdImgTextAndUpload(rawText);
103+
const { text, imageList } = matchMdImg(rawText);
103104

104105
createPdfParseUsage({
105106
teamId,
@@ -120,31 +121,33 @@ export const readRawContentByFileBuffer = async ({
120121
const parseTextImage = async (text: string) => {
121122
// Extract image links and convert to base64
122123
const imageList: { id: string; url: string }[] = [];
123-
const processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
124-
const id = getNanoid();
124+
let processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
125+
const id = `IMAGE_${getNanoid()}_IMAGE`;
125126
imageList.push({
126127
id,
127128
url
128129
});
129130
return `![](${id})`;
130131
});
131132

133+
// Get base64 from image url
132134
let resultImageList: ImageType[] = [];
133-
await Promise.all(
134-
imageList.map(async (item) => {
135+
await batchRun(
136+
imageList,
137+
async (item) => {
135138
try {
136-
const response = await axios.get(item.url, { responseType: 'arraybuffer' });
137-
const mime = response.headers['content-type'] || 'image/jpeg';
138-
const base64 = response.data.toString('base64');
139+
const { base64, mime } = await getImageBase64(item.url);
139140
resultImageList.push({
140141
uuid: item.id,
141142
mime,
142143
base64
143144
});
144145
} catch (error) {
146+
processedText = processedText.replace(item.id, item.url);
145147
addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`);
146148
}
147-
})
149+
},
150+
5
148151
);
149152

150153
return {
@@ -312,14 +315,14 @@ export const readRawContentByFileBuffer = async ({
312315
return await uploadMongoImg({
313316
base64Img: `data:${item.mime};base64,${item.base64}`,
314317
teamId,
315-
// expiredTime: addHours(new Date(), 1),
316318
metadata: {
317319
...metadata,
318320
mime: item.mime
319321
}
320322
});
321323
} catch (error) {
322-
return '';
324+
addLog.warn('Upload file image error', { error });
325+
return 'Upload load image error';
323326
}
324327
})();
325328
rawText = rawText.replace(item.uuid, src);

packages/service/core/chat/utils.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ export const loadRequestMessages = async ({
165165
try {
166166
// If imgUrl is a local path, load image from local, and set url to base64
167167
if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') {
168-
const base64 = await getImageBase64(imgUrl);
168+
const { completeBase64: base64 } = await getImageBase64(imgUrl);
169169

170170
return {
171171
...item,

packages/service/worker/htmlStr2Md/utils.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import TurndownService from 'turndown';
22
import { ImageType } from '../readFile/type';
3-
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
3+
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
44
import { getNanoid } from '@fastgpt/global/common/string/tools';
55
// @ts-ignore
66
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
@@ -46,7 +46,7 @@ export const html2md = (
4646
// Base64 img to id, otherwise it will occupy memory when going to md
4747
const { processedHtml, images } = processBase64Images(html);
4848
const md = turndownService.turndown(processedHtml);
49-
const { text, imageList } = matchMdImgTextAndUpload(md);
49+
const { text, imageList } = matchMdImg(md);
5050

5151
return {
5252
rawText: text,

projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx

+28-36
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react';
2-
import { Box, Flex, Button, Textarea, useTheme } from '@chakra-ui/react';
2+
import { Box, Flex, Button, Textarea } from '@chakra-ui/react';
33
import {
44
FieldArrayWithId,
55
UseFieldArrayRemove,
@@ -19,8 +19,7 @@ import MyModal from '@fastgpt/web/components/common/MyModal';
1919
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
2020
import { useQuery } from '@tanstack/react-query';
2121
import { useTranslation } from 'next-i18next';
22-
import { useRequest, useRequest2 } from '@fastgpt/web/hooks/useRequest';
23-
import { useConfirm } from '@fastgpt/web/hooks/useConfirm';
22+
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
2423
import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils';
2524
import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type';
2625
import DeleteIcon from '@fastgpt/web/components/common/Icon/delete';
@@ -30,10 +29,12 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
3029
import { getErrText } from '@fastgpt/global/common/error/utils';
3130
import { useSystemStore } from '@/web/common/system/useSystemStore';
3231
import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
33-
import { useSystem } from '@fastgpt/web/hooks/useSystem';
3432
import LightRowTabs from '@fastgpt/web/components/common/Tabs/LightRowTabs';
3533
import styles from './styles.module.scss';
36-
import { getDatasetIndexMapData } from '@fastgpt/global/core/dataset/data/constants';
34+
import {
35+
DatasetDataIndexTypeEnum,
36+
getDatasetIndexMapData
37+
} from '@fastgpt/global/core/dataset/data/constants';
3738

3839
export type InputDataType = {
3940
q: string;
@@ -62,11 +63,10 @@ const InputDataModal = ({
6263
onSuccess: (data: InputDataType & { dataId: string }) => void;
6364
}) => {
6465
const { t } = useTranslation();
65-
const theme = useTheme();
6666
const { toast } = useToast();
6767
const [currentTab, setCurrentTab] = useState(TabEnum.content);
6868
const { embeddingModelList, defaultModels } = useSystemStore();
69-
const { isPc } = useSystem();
69+
7070
const { register, handleSubmit, reset, control } = useForm<InputDataType>();
7171
const {
7272
fields: indexes,
@@ -112,11 +112,6 @@ const InputDataModal = ({
112112
}
113113
];
114114

115-
const { ConfirmModal, openConfirm } = useConfirm({
116-
content: t('common:dataset.data.Delete Tip'),
117-
type: 'delete'
118-
});
119-
120115
const { data: collection = defaultCollectionDetail } = useQuery(
121116
['loadCollectionId', collectionId],
122117
() => {
@@ -163,8 +158,8 @@ const InputDataModal = ({
163158
}, [collection.dataset.vectorModel, defaultModels.embedding, embeddingModelList]);
164159

165160
// import new data
166-
const { mutate: sureImportData, isLoading: isImporting } = useRequest({
167-
mutationFn: async (e: InputDataType) => {
161+
const { runAsync: sureImportData, loading: isImporting } = useRequest2(
162+
async (e: InputDataType) => {
168163
if (!e.q) {
169164
setCurrentTab(TabEnum.content);
170165
return Promise.reject(t('common:dataset.data.input is empty'));
@@ -181,31 +176,29 @@ const InputDataModal = ({
181176
collectionId: collection._id,
182177
q: e.q,
183178
a: e.a,
184-
// remove dataId
185-
indexes:
186-
e.indexes?.map((index) => ({
187-
...index,
188-
dataId: undefined
189-
})) || []
179+
// Contains no default index
180+
indexes: e.indexes
190181
});
191182

192183
return {
193184
...data,
194185
dataId
195186
};
196187
},
197-
successToast: t('common:dataset.data.Input Success Tip'),
198-
onSuccess(e) {
199-
reset({
200-
...e,
201-
q: '',
202-
a: '',
203-
indexes: []
204-
});
205-
onSuccess(e);
206-
},
207-
errorToast: t('common:common.error.unKnow')
208-
});
188+
{
189+
successToast: t('common:dataset.data.Input Success Tip'),
190+
onSuccess(e) {
191+
reset({
192+
...e,
193+
q: '',
194+
a: '',
195+
indexes: []
196+
});
197+
onSuccess(e);
198+
},
199+
errorToast: t('common:common.error.unKnow')
200+
}
201+
);
209202

210203
// update
211204
const { runAsync: onUpdateData, loading: isUpdating } = useRequest2(
@@ -239,6 +232,7 @@ const InputDataModal = ({
239232
() => getSourceNameIcon({ sourceName: collection.sourceName, sourceId: collection.sourceId }),
240233
[collection]
241234
);
235+
242236
return (
243237
<MyModal
244238
isOpen={true}
@@ -291,9 +285,8 @@ const InputDataModal = ({
291285
p={0}
292286
onClick={() =>
293287
appendIndexes({
294-
type: 'custom',
295-
text: '',
296-
dataId: `${Date.now()}`
288+
type: DatasetDataIndexTypeEnum.custom,
289+
text: ''
297290
})
298291
}
299292
>
@@ -331,7 +324,6 @@ const InputDataModal = ({
331324
</MyTooltip>
332325
</Flex>
333326
</MyBox>
334-
<ConfirmModal />
335327
</MyModal>
336328
);
337329
};

0 commit comments

Comments
 (0)