Merge branch 'main' of https://github.com/lobehub/lobe-chat

yuanze-dev · Mar 6, 2025 · 84c6cc1 · 84c6cc1
2 parents cb6e51c + 00a33bf
commit 84c6cc1
Show file tree

Hide file tree

Showing 11 changed files with 407 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,31 @@
 
 # Changelog
 
+### [Version 1.68.9](https://github.com/lobehub/lobe-chat/compare/v1.68.8...v1.68.9)
+
+<sup>Released on **2025-03-05**</sup>
+
+#### 💄 Styles
+
+- **misc**: Add epub file chunk split support.
+
+<br/>
+
+<details>
+<summary><kbd>Improvements and Fixes</kbd></summary>
+
+#### Styles
+
+- **misc**: Add epub file chunk split support, closes [#6317](https://github.com/lobehub/lobe-chat/issues/6317) ([a79ab7a](https://github.com/lobehub/lobe-chat/commit/a79ab7a))
+
+</details>
+
+<div align="right">
+
+[![](https://img.shields.io/badge/-BACK_TO_TOP-151515?style=flat-square)](#readme-top)
+
+</div>
+
 ### [Version 1.68.8](https://github.com/lobehub/lobe-chat/compare/v1.68.7...v1.68.8)
 
 <sup>Released on **2025-03-05**</sup>

diff --git a/changelog/v1.json b/changelog/v1.json
@@ -1,4 +1,11 @@
 [
+  {
+    "children": {
+      "improvements": ["Add epub file chunk split support."]
+    },
+    "date": "2025-03-05",
+    "version": "1.68.9"
+  },
   {
     "children": {
       "improvements": ["Improve openrouter models info."]

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@lobehub/chat",
-  "version": "1.68.8",
+  "version": "1.68.9",
   "description": "Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.",
   "keywords": [
     "framework",
@@ -162,10 +162,12 @@
     "diff": "^7.0.0",
     "drizzle-orm": "^0.40.0",
     "drizzle-zod": "^0.5.1",
+    "epub2": "^3.0.2",
     "fast-deep-equal": "^3.1.3",
     "file-type": "^20.0.0",
     "framer-motion": "^11.16.0",
     "gpt-tokenizer": "^2.8.1",
+    "html-to-text": "^9.0.5",
     "i18next": "^24.2.1",
     "i18next-browser-languagedetector": "^8.0.2",
     "i18next-resources-to-backend": "^1.2.1",

diff --git a/src/database/client/migrations.json b/src/database/client/migrations.json
@@ -223,7 +223,10 @@
     "hash": "9646161fa041354714f823d726af27247bcd6e60fa3be5698c0d69f337a5700b"
   },
   {
-    "sql": ["DROP TABLE \"user_budgets\";", "\nDROP TABLE \"user_subscriptions\";"],
+    "sql": [
+      "DROP TABLE \"user_budgets\";",
+      "\nDROP TABLE \"user_subscriptions\";"
+    ],
     "bps": true,
     "folderMillis": 1729699958471,
     "hash": "7dad43a2a25d1aec82124a4e53f8d82f8505c3073f23606c1dc5d2a4598eacf9"
@@ -295,7 +298,9 @@
     "hash": "845a692ceabbfc3caf252a97d3e19a213bc0c433df2689900135f9cfded2cf49"
   },
   {
-    "sql": ["ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;"],
+    "sql": [
+      "ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;"
+    ],
     "bps": true,
     "folderMillis": 1737609172353,
     "hash": "2cb36ae4fcdd7b7064767e04bfbb36ae34518ff4bb1b39006f2dd394d1893868"
@@ -309,4 +314,4 @@
     "folderMillis": 1739901891891,
     "hash": "78d8fefd8c58938d7bc3da2295a73b35ce2e8d7cb2820f8e817acdb8dd5bebb2"
   }
-]
+]
diff --git a/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap b/src/libs/langchain/loaders/epub/__tests__/__snapshots__/index.test.ts.snap
@@ -0,0 +1,238 @@
+// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
+
+exports[`EPubLoader > should run 1`] = `
+[
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 1,
+          "to": 13,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "HEFTY WATER
+
+This document serves to test Reading System support for the epub:switch
+[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-content-switch]
+element. There is also a little bit of ruby markup
+[http://www.w3.org/TR/html5/the-ruby-element.html#the-ruby-element] available.
+
+
+THE SWITCH
+
+Below is an instance of the epub:switch element, containing Chemical Markup
+Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
+fallback content is a chunk of plain XHTML5.",
+  },
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 9,
+          "to": 22,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "THE SWITCH
+
+Below is an instance of the epub:switch element, containing Chemical Markup
+Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
+fallback content is a chunk of plain XHTML5.
+
+ * If your Reading System supports epub:switch and CML, it will render the CML
+   formula natively, and ignore (a.k.a not display) the XHTML fallback.
+ * If your Reading System supports epub:switch but not CML, it will ignore (not
+   display) the CML formula, and render the the XHTML fallback instead.
+ * If your Reading System does not support epub:switch at all, then the
+   rendering results are somewhat unpredictable, but the most likely result is
+   that it will display both a failed attempt to render the CML and the XHTML
+   fallback.",
+  },
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 24,
+          "to": 43,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "Note: the XHTML fallback is bold and enclosed in a gray dotted box with a
+slightly gray background. A failed CML rendering will most likely appear above
+the gray fallback box and read:
+"H hydrogen O oxygen hefty H O water".
+
+Here the switch begins...
+
+
+H hydrogen O oxygen hefty H O water
+
+2H2 + O2 ⟶ 2H2O
+
+... and here the switch ends.
+
+
+THE SOURCE
+
+Below is a rendition of the source code of the switch element. Your Reading
+System should display this correctly regardless of whether it supports the
+switch element.",
+  },
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 46,
+          "to": 66,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "<switch xmlns="http://www.idpf.org/2007/ops">
+    <case required-namespace="http://www.xml-cml.org/schema">
+        <chem xmlns="http://www.xml-cml.org/schema">
+            <reaction>
+                <molecule n="2">
+                    <atom n="2"> H </atom>
+                    <caption> hydrogen </caption>
+                </molecule>
+                <plus></plus>
+                <molecule>
+                    <atom n="2"> O </atom>
+                    <caption> oxygen </caption>
+                </molecule>
+                <gives>
+                    <caption> hefty </caption>
+                </gives>
+                <molecule n="2">
+                    <atom n="2"> H </atom>
+                    <atom> O </atom>
+                    <caption> water </caption>
+                </molecule>",
+  },
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 57,
+          "to": 79,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "<caption> oxygen </caption>
+                </molecule>
+                <gives>
+                    <caption> hefty </caption>
+                </gives>
+                <molecule n="2">
+                    <atom n="2"> H </atom>
+                    <atom> O </atom>
+                    <caption> water </caption>
+                </molecule>
+            </reaction>
+        </chem>
+    </case>
+    <default>
+        <p xmlns="http://www.w3.org/1999/xhtml" id="fallback">
+            <span>2H<sub>2</sub></span>
+            <span>+</span>
+            <span>O<sub>2</sub></span>
+            <span>⟶</span>
+            <span>2H<sub>2</sub>O</span>
+        </p>
+    </default>                
+</switch>",
+  },
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 84,
+          "to": 94,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "HEFTY RUBY WATER
+
+While the ruby element is mostly used in east-asian languages, it can also be
+useful in other contexts. As an example, and as you can see in the source of the
+CML element above, the code includes a caption element which is intended to be
+displayed below the formula segments. Following this paragraph is a reworked
+version of the XHTML fallback used above, using the ruby element. If your
+Reading System does not support ruby markup, then the captions will appear in
+parentheses on the same line as the formula segments.
+
+2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)",
+  },
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 94,
+          "to": 111,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)
+
+If your Reading System in addition to supporting ruby markup also supports the
+-epub-ruby-position
+[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-css-ruby-position]
+property, then the captions will appear under the formula segments instead of
+over them.
+
+The source code for the ruby version of the XHTML fallback looks as follows:
+
+
+<p id="rubyp">
+    <ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
+    <span>+</span>
+    <ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
+    <ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
+    <ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
+</p>",
+  },
+  Document {
+    "id": undefined,
+    "metadata": {
+      "loc": {
+        "lines": {
+          "from": 105,
+          "to": 120,
+        },
+      },
+      "source": "",
+    },
+    "pageContent": "<p id="rubyp">
+    <ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
+    <span>+</span>
+    <ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
+    <ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
+    <ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
+</p>                
+            
+
+... and the css declaration using the -epub-ruby-position property looks like
+this:
+
+
+p#rubyp {
+    -epub-ruby-position : under;
+}",
+  },
+]
+`;
diff --git a/src/libs/langchain/loaders/epub/__tests__/demo.epub b/src/libs/langchain/loaders/epub/__tests__/demo.epub
diff --git a/src/libs/langchain/loaders/epub/__tests__/index.test.ts b/src/libs/langchain/loaders/epub/__tests__/index.test.ts
@@ -0,0 +1,24 @@
+// @vitest-environment node
+import * as fs from 'node:fs';
+import { join } from 'node:path';
+import { expect } from 'vitest';
+
+import { EPubLoader } from '../index';
+
+function sanitizeDynamicFields(document: any[]) { // blanks metadata.source on each entry, in place
+  for (const doc of document) {
+    doc.metadata.source && (doc.metadata.source = ''); // only overwritten when currently truthy; presumably a per-run temp path (confirm)
+  }
+  return document; // same array instance that was passed in
+}
+
+describe('EPubLoader', () => {
+  it('should run', async () => {
+    const content = fs.readFileSync(join(__dirname, `./demo.epub`)); // fixture sits next to this test file
+
+    const fileContent: Uint8Array = new Uint8Array(content); // EPubLoader takes a Uint8Array
+
+    const data = await EPubLoader(fileContent);
+    expect(sanitizeDynamicFields(data)).toMatchSnapshot(); // source is blanked before comparing with the stored snapshot
+  });
+});
diff --git a/src/libs/langchain/loaders/epub/index.ts b/src/libs/langchain/loaders/epub/index.ts
@@ -0,0 +1,21 @@
+import { EPubLoader as Loader } from '@langchain/community/document_loaders/fs/epub';
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+import { loaderConfig } from '../config';
+import { TempFileManager } from '@/server/utils/tempFileManager';
+
+export const EPubLoader = async (content: Uint8Array) => { // loads EPUB bytes and resolves to the split documents
+  const tempManager = new TempFileManager();
+  try {
+    const tempPath = await tempManager.writeTempFile(content); // the loader below is constructed from a path, not from bytes
+    const loader = new Loader(tempPath);
+    const documents = await loader.load();
+
+    const splitter = new RecursiveCharacterTextSplitter(loaderConfig); // splitter options come from ../config
+    return await splitter.splitDocuments(documents);
+  } catch (e) {
+    throw new Error(`EPubLoader error: ${(e as Error).message}`); // rethrows with a prefixed message; the original error object is not attached
+  } finally {
+    tempManager.cleanup(); // make sure cleanup runs on both success and failure
+  }
+
+};