feat: integrate jina sum plugin (#16)

hanfangyuan4396 · Apr 19, 2024 · e26eba4 · e26eba4
1 parent a4cbeca
commit e26eba4
Show file tree

Hide file tree

Showing 7 changed files with 183 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -30,4 +30,5 @@ plugins/banwords/lib/__pycache__
 !plugins/role
 !plugins/keyword
 !plugins/linkai
+!plugins/jina_sum
 client_config.json
diff --git a/plugins/jina_sum/.gitignore b/plugins/jina_sum/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+config.json
diff --git a/plugins/jina_sum/LICENSE b/plugins/jina_sum/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Han Fangyuan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/plugins/jina_sum/README.md b/plugins/jina_sum/README.md
@@ -0,0 +1,20 @@
+# jina_sumary
+ChatGPT on WeChat项目插件, 使用jina reader和ChatGPT总结网页链接内容
+
+支持总结公众号、小红书、csdn等分享卡片链接(有的卡片链接会触发验证，一般直链没有此问题)
+
+![wechat_mp](./docs/images/wechat_mp.jpg)
+![red](./docs/images/red.jpg)
+![csdn](./docs/images/csdn.jpg)
+
+config.json 配置说明
+```bash
+{
+  "jina_reader_base": "https://r.jina.ai",           # jina reader链接，默认为https://r.jina.ai
+  "open_ai_api_base": "https://api.openai.com/v1",   # chatgpt chat url
+  "open_ai_api_key":  "sk-xxx",                      # chatgpt api key
+  "open_ai_model": "gpt-3.5-turbo",                  # chatgpt model
+  "max_words": 8000,                                 # 网页链接内容的最大字数，防止超过最大输入token，使用字符串长度简单计数
+  "prompt": "我需要对下面的文本进行总结，总结输出包括以下三个部分：\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动。"                           # 链接内容总结提示词
+}
+```
diff --git a/plugins/jina_sum/__init__.py b/plugins/jina_sum/__init__.py
@@ -0,0 +1 @@
+from .jina_sum import *
diff --git a/plugins/jina_sum/config.json.template b/plugins/jina_sum/config.json.template
@@ -0,0 +1,8 @@
+{
+  "jina_reader_base": "https://r.jina.ai",
+  "open_ai_api_base": "https://api.openai.com/v1",
+  "open_ai_api_key":  "sk-xxx",
+  "open_ai_model": "gpt-3.5-turbo",
+  "max_words": 8000,
+  "prompt": "我需要对下面的文本进行总结，总结输出包括以下三个部分：\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动。"
+}
diff --git a/plugins/jina_sum/jina_sum.py b/plugins/jina_sum/jina_sum.py
@@ -0,0 +1,130 @@
+# encoding:utf-8
+import json
+import os
+import html
+from urllib.parse import urlparse
+
+import requests
+
+import plugins
+from bridge.context import ContextType
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from plugins import *
+
+@plugins.register(
+    name="JinaSum",
+    desire_priority=10,
+    hidden=False,
+    desc="Sum url link content with jina reader and llm",
+    version="v0.0.1",
+    author="hanfangyuan",
+)
+class JinaSum(Plugin):
+
+    jina_reader_base = "https://r.jina.ai"
+    open_ai_api_base = "https://api.openai.com/v1"
+    open_ai_model = "gpt-3.5-turbo"
+    max_words = 8000
+    prompt = "我需要对下面引号内文档进行总结，总结输出包括以下三个部分：\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动\n\n"
+
+    def __init__(self):
+        super().__init__()
+        try:
+            self.config = super().load_config()
+            if not self.config:
+                self.config = self._load_config_template()
+            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
+            self.open_ai_api_base = self.config.get("open_ai_api_base", self.open_ai_api_base)
+            self.open_ai_api_key = self.config.get("open_ai_api_key", "")
+            self.open_ai_model = self.config.get("open_ai_model", self.open_ai_model)
+            self.max_words = self.config.get("max_words", self.max_words)
+            self.prompt = self.config.get("prompt", self.prompt)
+            logger.info(f"[JinaSum] inited, config={self.config}")
+            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
+        except Exception as e:
+            logger.error(f"[JinaSum] 初始化异常：{e}")
+            raise "[JinaSum] init failed, ignore "
+
+    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
+        try:
+            context = e_context["context"]
+            content = context.content
+            if context.type != ContextType.SHARING and context.type != ContextType.TEXT:
+                return
+            if not self._check_url(content):
+                logger.debug(f"[JinaSum] {content} not a url, skip")
+                return
+            if retry_count == 0:
+                logger.debug("[JinaSum] on_handle_context. content: %s" % content)
+                reply = Reply(ReplyType.TEXT, "🎉正在为您生成总结，请稍候...")
+                channel = e_context["channel"]
+                channel.send(reply, context)
+
+            target_url = html.unescape(content) # 解决公众号卡片链接校验问题，参考 https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
+            jina_url = self._get_jina_url(target_url)
+            response = requests.get(jina_url, timeout=60)
+            response.raise_for_status()
+            target_url_content = response.text
+
+            openai_chat_url = self._get_openai_chat_url()
+            openai_headers = self._get_openai_headers()
+            openai_payload = self._get_openai_payload(target_url_content)
+            logger.debug(f"[JinaSum] openai_chat_url: {openai_chat_url}, openai_headers: {openai_headers}, openai_payload: {openai_payload}")
+            response = requests.post(openai_chat_url, headers=openai_headers, json=openai_payload, timeout=60)
+            response.raise_for_status()
+            result = response.json()['choices'][0]['message']['content']
+            reply = Reply(ReplyType.TEXT, result)
+            e_context["reply"] = reply
+            e_context.action = EventAction.BREAK_PASS
+
+        except Exception as e:
+            if retry_count < 3:
+                logger.warning(f"[JinaSum] {str(e)}, retry {retry_count + 1}")
+                self.on_handle_context(e_context, retry_count + 1)
+                return
+
+            logger.exception(f"[JinaSum] {str(e)}")
+            reply = Reply(ReplyType.ERROR, "我暂时无法总结链接，请稍后再试")
+            e_context["reply"] = reply
+            e_context.action = EventAction.BREAK_PASS
+
+    def get_help_text(self, verbose, **kwargs):
+        return f'使用jina reader和ChatGPT总结网页链接内容'
+
+    def _load_config_template(self):
+        logger.debug("No Suno plugin config.json, use plugins/jina_sum/config.json.template")
+        try:
+            plugin_config_path = os.path.join(self.path, "config.json.template")
+            if os.path.exists(plugin_config_path):
+                with open(plugin_config_path, "r", encoding="utf-8") as f:
+                    plugin_conf = json.load(f)
+                    return plugin_conf
+        except Exception as e:
+            logger.exception(e)
+
+    def _get_jina_url(self, target_url):
+        return self.jina_reader_base + "/" + target_url
+
+    def _get_openai_chat_url(self):
+        return self.open_ai_api_base + "/chat/completions"
+
+    def _get_openai_headers(self):
+        return {
+            'Authorization': f"Bearer {self.open_ai_api_key}",
+            'Host': urlparse(self.open_ai_api_base).netloc
+        }
+
+    def _get_openai_payload(self, target_url_content):
+        target_url_content = target_url_content[:self.max_words] # 通过字符串长度简单进行截断
+        sum_prompt = f"{self.prompt}\n\n'''{target_url_content}'''"
+        messages = [{"role": "user", "content": sum_prompt}]
+        payload = {
+            'model': self.open_ai_model,
+            'messages': messages
+        }
+        return payload
+
+    def _check_url(self, target_url: str):
+        # 简单校验是否是url
+        return target_url.strip().startswith("http://") or target_url.strip().startswith("https://")