
Commit 67c5299

External dataset (labring#1519)
* perf: local file create collection
* rename middleware
* perf: remove code
* feat: next14
* feat: external file dataset
* collection tags field
* external file dataset doc
* fix: ts
1 parent 2d1ec9b commit 67c5299

102 files changed: +1841 -1284 lines


.npmrc (-1)

@@ -1,2 +1 @@
 public-hoist-pattern[]=*tiktoken*
-public-hoist-pattern[]=*react*

.vscode/nextapi.code-snippets (+1 -1)

@@ -11,7 +11,7 @@
     "prefix": "nextapi",
     "body": [
       "import type { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next';",
-      "import { NextAPI } from '@/service/middle/entry';",
+      "import { NextAPI } from '@/service/middleware/entry';",
       "",
       "export type ${TM_FILENAME_BASE}Query = {};",
       "",
3 image files changed (binary, previews not rendered): 163 KB, 122 KB, 74.6 KB.
New file (path not shown in this view) (+26)

@@ -0,0 +1,26 @@
+---
+title: 'External File Dataset'
+description: 'Introduction to the FastGPT external file dataset feature and how to use it'
+icon: 'language'
+draft: false
+toc: true
+weight: 107
+---
+
+The external file dataset is a FastGPT commercial-edition feature. It lets you connect your existing file system without importing another copy of the files into FastGPT.
+
+In addition, read permissions can be controlled by your own file system.
+
+| | | |
+| --------------------- | --------------------- | --------------------- |
+| ![](/imgs/external_file0.png) | ![](/imgs/external_file1.png) | ![](/imgs/external_file2.png) |
+
+
+## Import parameters
+
+- External preview URL: the address used to jump to your file reader; the request carries the "file read ID".
+- File access URL: the address from which the file can be fetched.
+- File read ID: the file access URL is usually temporary. For permanent access, use this file read ID together with the "external preview URL" to jump to a new reading address for the original file.
+- File name: by default it is parsed from the file access URL. If you fill it in manually, the manual value takes precedence.
+
+[See the API import documentation](/docs/development/openapi/dataset/#创建一个外部文件库集合商业版)
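To make the parameter interplay concrete, here is a minimal sketch of how a reader link is assembled from the "external preview URL" and the "file read ID". The query-parameter name and the URLs are illustrative assumptions, not part of the documentation.

```ts
// Illustrative only: your file system defines the real preview route and parameter name.
const externalPreviewUrl = 'https://files.example.com/preview'; // external preview URL
const externalFileId = 'doc-2024-0001'; // file read ID, permanent in your file system

// FastGPT jumps to the preview address and carries the file read ID with it,
// so the (possibly temporary) file access URL is not needed to read the original file.
const readerUrl = `${externalPreviewUrl}?id=${encodeURIComponent(externalFileId)}`;
console.log(readerUrl); // https://files.example.com/preview?id=doc-2024-0001
```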

docSite/content/docs/development/openapi/dataset.md (+82 -2)

@@ -295,6 +295,24 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
 
 ## Collections
 
+### Common creation parameters
+
+**Request parameters**
+
+| Parameter | Description | Required |
+| --- | --- | --- |
+| datasetId | Dataset ID | ✓ |
+| parentId | Parent ID; defaults to the root directory if omitted | |
+| trainingType | Training mode. chunk: split by text length; qa: QA split; auto: augmented training | ✓ |
+| chunkSize | Estimated chunk size | |
+| chunkSplitter | Custom highest-priority split character | |
+| qaPrompt | QA split prompt | |
+
+**Response**
+
+- collectionId: ID of the newly created collection
+- insertLen: number of chunks inserted
+
 ### Create an empty collection
 
 {{< tabs tabTotal="3" >}}
@@ -500,7 +518,7 @@ data is the collection ID.
 {{< /tab >}}
 {{< /tabs >}}
 
-### Create a file collection (commercial edition)
+### Create a file collection
 
 Pass in a file to create a collection; the file content is read and split. Currently supported: pdf, docx, md, txt, html, csv.
 
@@ -509,7 +527,7 @@ data is the collection ID.
 {{< markdownify >}}
 
 ```bash
-curl --location --request POST 'http://localhost:3000/api/proApi/core/dataset/collection/create/file' \
+curl --location --request POST 'http://localhost:3000/api/core/dataset/collection/create/localFile' \
 --header 'Authorization: Bearer {{authorization}}' \
 --form 'file=@"C:\\Users\\user\\Desktop\\fastgpt测试文件\\index.html"' \
 --form 'data="{\"datasetId\":\"6593e137231a2be9c5603ba7\",\"parentId\":null,\"trainingType\":\"chunk\",\"chunkSize\":512,\"chunkSplitter\":\"\",\"qaPrompt\":\"\",\"metadata\":{}}"'
@@ -565,6 +583,68 @@ data is the collection ID.
 {{< /tab >}}
 {{< /tabs >}}
 
+### Create an external file collection (commercial edition)
+
+{{< tabs tabTotal="3" >}}
+{{< tab tabName="Request example" >}}
+{{< markdownify >}}
+
+```bash
+curl --location --request POST 'http://localhost:3000/api/proApi/core/dataset/collection/create/externalFileUrl' \
+--header 'Authorization: Bearer {{authorization}}' \
+--header 'User-Agent: Apifox/1.0.0 (https://apifox.com)' \
+--header 'Content-Type: application/json' \
+--data-raw '{
+    "externalFileUrl":"https://image.xxxxx.com/fastgpt-dev/%E6%91%82.pdf",
+    "externalFileId":"1111",
+    "filename":"Custom file name",
+    "datasetId":"6642d105a5e9d2b00255b27b",
+    "parentId": null,
+
+    "trainingType": "chunk",
+    "chunkSize":512,
+    "chunkSplitter":"",
+    "qaPrompt":""
+}'
+```
+
+{{< /markdownify >}}
+{{< /tab >}}
+
+{{< tab tabName="Parameters" >}}
+{{< markdownify >}}
+
+| Parameter | Description | Required |
+| --- | --- | --- |
+| externalFileUrl | File access URL (may be a temporary link) | ✓ |
+| externalFileId | External file ID | |
+| filename | Custom file name | |
+
+
+{{< /markdownify >}}
+{{< /tab >}}
+
+{{< tab tabName="Response example" >}}
+{{< markdownify >}}
+
+data is the collection ID.
+
+```json
+{
+    "code": 200,
+    "statusText": "",
+    "message": "",
+    "data": {
+        "collectionId": "6646fcedfabd823cdc6de746",
+        "insertLen": 3
+    }
+}
+```
+
+{{< /markdownify >}}
+{{< /tab >}}
+{{< /tabs >}}
+
 ### Get the collection list
 
 {{< tabs tabTotal="3" >}}
docSite/content/docs/development/upgrading/481.md

+8-5
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,11 @@ curl --location --request POST 'https://{{host}}/api/admin/clearInvalidData' \
3535
## V4.8.1 更新说明
3636

3737
1. 新增 - 知识库重新选择向量模型重建
38-
2. 新增 - 工作流节点版本变更提示,并可以同步最新版本。
39-
3. 优化 - 插件输入的 debug 模式,支持全量参数输入渲染。
40-
4. 修复 - 插件输入默认值被清空问题。
41-
5. 修复 - 工作流删除节点的动态输入和输出时候,没有正确的删除连接线,导致可能出现逻辑异常。
42-
6. 修复 - 定时器清理脏数据任务
38+
2. 新增 - 对话框支持问题模糊检索提示,可自定义预设问题词库。
39+
3. 新增 - 工作流节点版本变更提示,并可以同步最新版本配置,避免存在隐藏脏数据。
40+
4. 新增 - 开放文件导入知识库接口到开源版, [点击插件文档](/docs/development/openapi/dataset/#创建一个文件集合)
41+
5. 新增 - 外部文件源知识库, [点击查看文档](/docs/course/externalfile/)
42+
6. 优化 - 插件输入的 debug 模式,支持全量参数输入渲染。
43+
7. 修复 - 插件输入默认值被清空问题。
44+
8. 修复 - 工作流删除节点的动态输入和输出时候,没有正确的删除连接线,导致可能出现逻辑异常。
45+
9. 修复 - 定时器清理脏数据任务

packages/global/core/dataset/api.d.ts (+15 -1)

@@ -26,18 +26,27 @@ export type DatasetCollectionChunkMetadataType = {
   qaPrompt?: string;
   metadata?: Record<string, any>;
 };
+
+// create collection params
 export type CreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
   datasetId: string;
   name: string;
-  type: `${DatasetCollectionTypeEnum}`;
+  type: DatasetCollectionTypeEnum;
+
+  tags?: string[];
+
   fileId?: string;
   rawLink?: string;
+  externalFileId?: string;
+
+  externalFileUrl?: string;
   rawTextLength?: number;
   hashRawText?: string;
 };
 
 export type ApiCreateDatasetCollectionParams = DatasetCollectionChunkMetadataType & {
   datasetId: string;
+  tags?: string[];
 };
 export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
   name: string;
@@ -58,6 +67,11 @@ export type CsvTableCreateDatasetCollectionParams = {
   parentId?: string;
   fileId: string;
 };
+export type ExternalFileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
+  externalFileId?: string;
+  externalFileUrl: string;
+  filename?: string;
+};
 
 /* ================= data ===================== */
 export type PgSearchRawType = {
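As a quick illustration of the new type, the object below is a payload that would satisfy `ExternalFileCreateDatasetCollectionParams`, which extends `ApiCreateDatasetCollectionParams` and, through it, the chunk-metadata fields. The import path and the inherited chunk fields are assumptions based on the surrounding diff, not verified against the full file.

```ts
import type { ExternalFileCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';

// Hypothetical payload; field names follow the types added in this diff.
const params: ExternalFileCreateDatasetCollectionParams = {
  datasetId: '6642d105a5e9d2b00255b27b',
  tags: ['contract', '2024'], // optional collection tags introduced in this commit
  externalFileUrl: 'https://files.example.com/tmp/report.pdf', // required; may be a temporary link
  externalFileId: 'report-2024-05', // optional permanent ID in the external system
  filename: 'Q2 report.pdf', // optional; otherwise derived from the URL
  chunkSize: 512, // assumed to come from DatasetCollectionChunkMetadataType
  qaPrompt: ''
};
```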

packages/global/core/dataset/collection/constants.ts (+1 -1)

@@ -1,4 +1,4 @@
-/* sourceId = prefix-id; id=fileId;link url;externalId */
+/* sourceId = prefix-id; id=fileId;link url;externalFileId */
 export enum CollectionSourcePrefixEnum {
   local = 'local',
   link = 'link',
New file (path not shown in this view) (+14)

@@ -0,0 +1,14 @@
+import { CollectionWithDatasetType, DatasetCollectionSchemaType } from '../type';
+
+export const getCollectionSourceData = (
+  collection?: CollectionWithDatasetType | DatasetCollectionSchemaType
+) => {
+  return {
+    sourceId:
+      collection?.fileId ||
+      collection?.rawLink ||
+      collection?.externalFileId ||
+      collection?.externalFileUrl,
+    sourceName: collection?.name || ''
+  };
+};
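A short usage sketch of the new helper: it resolves the collection's source identifier in priority order (fileId, then rawLink, then externalFileId, then externalFileUrl). The collection fragment below is hand-written for illustration and cast loosely, since the full schema type has many more fields.

```ts
const { sourceId, sourceName } = getCollectionSourceData({
  name: 'Q2 report.pdf',
  externalFileId: 'report-2024-05',
  externalFileUrl: 'https://files.example.com/tmp/report.pdf'
} as any);

console.log(sourceId); // 'report-2024-05', since externalFileId takes precedence over externalFileUrl
console.log(sourceName); // 'Q2 report.pdf'
```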

packages/global/core/dataset/constants.ts (+7 -2)

@@ -22,7 +22,7 @@ export const DatasetTypeMap = {
     collectionLabel: 'common.Website'
   },
   [DatasetTypeEnum.externalFile]: {
-    icon: 'core/dataset/commonDataset',
+    icon: 'core/dataset/externalDataset',
     label: 'External File',
     collectionLabel: 'common.File'
   }
@@ -44,9 +44,11 @@ export const DatasetStatusMap = {
 /* ------------ collection -------------- */
 export enum DatasetCollectionTypeEnum {
   folder = 'folder',
+  virtual = 'virtual',
+
   file = 'file',
   link = 'link', // one link
-  virtual = 'virtual'
+  externalFile = 'externalFile'
 }
 export const DatasetCollectionTypeMap = {
   [DatasetCollectionTypeEnum.folder]: {
@@ -55,6 +57,9 @@ export const DatasetCollectionTypeMap = {
   [DatasetCollectionTypeEnum.file]: {
     name: 'core.dataset.file'
   },
+  [DatasetCollectionTypeEnum.externalFile]: {
+    name: 'core.dataset.externalFile'
+  },
   [DatasetCollectionTypeEnum.link]: {
     name: 'core.dataset.link'
   },
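A small sketch of how the new collection type plugs into the existing map (import path assumed from the package layout):

```ts
import {
  DatasetCollectionTypeEnum,
  DatasetCollectionTypeMap
} from '@fastgpt/global/core/dataset/constants';

// Resolves to the i18n key added in this commit: 'core.dataset.externalFile'
const label = DatasetCollectionTypeMap[DatasetCollectionTypeEnum.externalFile].name;
console.log(label);
```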

packages/global/core/dataset/read.ts (-2)

@@ -1,7 +1,5 @@
 import { DatasetSourceReadTypeEnum, ImportDataSourceEnum } from './constants';
 
-export const rawTextBackupPrefix = 'index,content';
-
 export const importType2ReadType = (type: ImportDataSourceEnum) => {
   if (type === ImportDataSourceEnum.csvTable || type === ImportDataSourceEnum.fileLocal) {
     return DatasetSourceReadTypeEnum.fileLocal;

packages/global/core/dataset/type.d.ts (+5 -3)

@@ -41,7 +41,7 @@ export type DatasetCollectionSchemaType = {
   datasetId: string;
   parentId?: string;
   name: string;
-  type: `${DatasetCollectionTypeEnum}`;
+  type: DatasetCollectionTypeEnum;
   createTime: Date;
   updateTime: Date;
 
@@ -50,13 +50,15 @@ export type DatasetCollectionSchemaType = {
   chunkSplitter?: string;
   qaPrompt?: string;
 
-  sourceId?: string; // relate CollectionSourcePrefixEnum
+  tags?: string[];
+
   fileId?: string; // local file id
   rawLink?: string; // link url
+  externalFileId?: string; //external file id
 
   rawTextLength?: number;
   hashRawText?: string;
-  externalSourceUrl?: string; // external import url
+  externalFileUrl?: string; // external import url
   metadata?: {
     webPageSelector?: string;
     relatedImgId?: string; // The id of the associated image collections

packages/global/core/dataset/utils.ts (+5 -5)

@@ -3,7 +3,7 @@ import { getFileIcon } from '../../common/file/icon';
 import { strIsLink } from '../../common/string/tools';
 
 export function getCollectionIcon(
-  type: `${DatasetCollectionTypeEnum}` = DatasetCollectionTypeEnum.file,
+  type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
   name = ''
 ) {
   if (type === DatasetCollectionTypeEnum.folder) {
@@ -24,13 +24,13 @@ export function getSourceNameIcon({
   sourceName: string;
   sourceId?: string;
 }) {
-  if (strIsLink(sourceId)) {
-    return 'common/linkBlue';
-  }
-  const fileIcon = getFileIcon(sourceName, '');
+  const fileIcon = getFileIcon(decodeURIComponent(sourceName), '');
   if (fileIcon) {
     return fileIcon;
   }
+  if (strIsLink(sourceId)) {
+    return 'common/linkBlue';
+  }
 
   return 'file/fill/manual';
 }
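The reordering in `getSourceNameIcon` changes which icon wins for link sources whose URL ends in a recognizable file extension, and the source name is now URL-decoded before the lookup. A sketch of the effect, with the import path assumed and `getFileIcon`'s exact return value treated as an assumption:

```ts
import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils';

const icon = getSourceNameIcon({
  sourceName: 'report%20v2.pdf', // decoded to 'report v2.pdf' before the icon lookup
  sourceId: 'https://files.example.com/report%20v2.pdf'
});
// Before this change the link check ran first and returned 'common/linkBlue'.
// Now the extension-based file icon wins (assuming getFileIcon matches .pdf);
// 'common/linkBlue' is used only when no file icon is found, and
// 'file/fill/manual' remains the final fallback.
```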

packages/global/package.json (+1 -1)

@@ -10,7 +10,7 @@
     "js-yaml": "^4.1.0",
     "jschardet": "3.1.1",
     "nanoid": "^4.0.1",
-    "next": "13.5.2",
+    "next": "14.2.3",
     "openai": "4.28.0",
     "openapi-types": "^12.1.3",
     "timezones-list": "^3.0.2"

packages/service/common/file/gridfs/controller.ts (+2 -2)

@@ -7,7 +7,7 @@ import { MongoFileSchema } from './schema';
 import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
 import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
-import { readFileRawContent } from '../read/utils';
+import { readRawContentByFileBuffer } from '../read/utils';
 import { PassThrough } from 'stream';
 
 export function getGFSCollection(bucket: `${BucketNameEnum}`) {
@@ -196,7 +196,7 @@ export const readFileContentFromMongo = async ({
     });
   })();
 
-  const { rawText } = await readFileRawContent({
+  const { rawText } = await readRawContentByFileBuffer({
     extension,
     isQAImport,
     teamId,
