实现GUI实时队列版

shing-yu · Oct 16, 2023 · 9762041 · 9762041
1 parent 3f0ef8a
commit 9762041
Show file tree

Hide file tree

Showing 3 changed files with 386 additions and 0 deletions.
diff --git a/src/list_edition/fanqie_list.py b/src/list_edition/fanqie_list.py
@@ -0,0 +1,167 @@
+"""
+作者：星隅（xing-yv）
+
+版权所有（C）2023 星隅（xing-yv）
+
+本软件根据GNU通用公共许可证第三版（GPLv3）发布；
+你可以在以下位置找到该许可证的副本：
+https://www.gnu.org/licenses/gpl-3.0.html
+
+根据GPLv3的规定，您有权在遵循许可证的前提下自由使用、修改和分发本软件。
+请注意，根据许可证的要求，任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。
+
+本软件提供的是按"原样"提供的，没有任何明示或暗示的保证，包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内，作者明确放弃了所有明示或暗示的担保和条件。
+
+免责声明：
+该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
+
+请在使用本程序之前确保遵守相关法律法规和网站的使用政策，如有疑问，请咨询法律顾问。
+
+无论您对程序进行了任何操作，请始终保留此信息。
+"""
+import os
+
+# 导入必要的模块
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+import re
+import datetime
+from os import path
+import time
+import public as p
+
+
+# Normal-mode Fanqie novel downloader: constants and private helpers first,
+# the public entry point ``fanqie_l`` last.
+
+# Seconds to wait on each HTTP request. Without an explicit timeout,
+# requests.get() may block indefinitely on a stalled connection.
+_REQUEST_TIMEOUT = 10
+
+# Number of API attempts made per chapter before the chapter is skipped.
+_MAX_ATTEMPTS = 3
+
+
+def _save_novel(file_path, content, encoding):
+    """Encode ``content`` with ``encoding`` and write it to ``file_path``.
+
+    Characters that ``encoding`` cannot represent are dropped
+    (``errors='ignore'``) instead of raising.
+    """
+    data = content.encode(encoding, errors='ignore')
+    with open(file_path, "wb") as f:
+        f.write(data)
+
+
+def _fetch_chapter_content(api_url, headers, chapter_title, output_queue):
+    """Return the ``data.content`` field of the reader API reply, or None.
+
+    Makes up to ``_MAX_ATTEMPTS`` requests. A network error, a reply that is
+    not valid JSON, or a reply without ``data.content`` counts as one failed
+    attempt; each failure is reported on ``output_queue``.
+    """
+    for attempt in range(1, _MAX_ATTEMPTS + 1):
+        try:
+            api_response = requests.get(
+                api_url, headers=headers, timeout=_REQUEST_TIMEOUT
+            )
+            api_data = api_response.json()
+        except (requests.RequestException, ValueError):
+            # ValueError covers every JSON decoding error raised by .json().
+            api_data = None
+
+        chapter_data = api_data.get("data") if isinstance(api_data, dict) else None
+        if isinstance(chapter_data, dict) and "content" in chapter_data:
+            return chapter_data["content"]
+
+        if attempt == 1:
+            output_queue.put(f"{chapter_title} 获取失败，正在尝试重试...")
+        output_queue.put(f"第 ({attempt}/{_MAX_ATTEMPTS}) 次重试获取章节内容")
+
+    return None
+
+
+def fanqie_l(url, encoding, output_queue):
+    """Download every chapter listed on a novel page into ``output/<title>.txt``.
+
+    Args:
+        url: Address of the novel page; chapter links are resolved against it.
+        encoding: Text encoding used for the output file.
+        output_queue: Object with a ``put`` method that receives progress and
+            error messages as strings.
+
+    Returns:
+        None. Progress is reported only through ``output_queue``.
+
+    A chapter whose content cannot be fetched or parsed is skipped. Any other
+    exception raised while processing chapters is reported, and the text
+    collected so far is still written to disk.
+    """
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+    }
+
+    try:
+        # Fetch and parse the novel page.
+        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # The title doubles as the file name, so sanitize it.
+        title = p.rename(soup.find("h1").get_text())
+
+        output_queue.put(f"正在获取: {title}")
+
+        info = soup.find("div", class_="page-header-info").get_text()
+        intro = soup.find("div", class_="page-abstract-content").get_text()
+    except (requests.RequestException, AttributeError) as e:
+        # AttributeError: an expected element is missing (find() returned
+        # None). Report it on the queue rather than failing without output.
+        output_queue.put(f"发生异常: \n{e}")
+        return
+
+    # File header followed by the novel's title, info and introduction.
+    header = f"""如果需要小说更新，请勿修改文件名
+使用 @星隅(xing-yv) 所作开源工具下载
+开源仓库地址:https://github.com/xing-yv/fanqie-novel-download
+Gitee:https://gitee.com/xingyv1024/fanqie-novel-download/
+任何人无权限制您访问本工具，如果有向您提供代下载服务者未事先告知您工具的获取方式，请向作者举报:[email protected]
+
+{title}
+{info}
+{intro}
+"""
+    # Collect pieces in a list and join once when saving.
+    parts = [header]
+
+    chapters = soup.find_all("div", class_="chapter-item")
+
+    file_path = path.join('output', f'{title}.txt')
+
+    os.makedirs("output", exist_ok=True)
+
+    try:
+        for chapter in chapters:
+            # Pause between chapters.
+            time.sleep(1)
+
+            link = chapter.find("a")
+            chapter_title = link.get_text()
+            chapter_url = urljoin(url, link["href"])
+
+            # The chapter id is the first run of digits following a slash.
+            chapter_id = re.search(r"/(\d+)", chapter_url).group(1)
+
+            api_url = f"https://novel.snssdk.com/api/novel/book/reader/full/v1/?device_platform=android&parent_enterfrom=novel_channel_search.tab.&aid=2329&platform_id=1&group_id={chapter_id}&item_id={chapter_id}"
+
+            chapter_content = _fetch_chapter_content(
+                api_url, headers, chapter_title, output_queue
+            )
+            if chapter_content is None:
+                output_queue.put(f"无法获取章节内容: {chapter_title}，跳过。")
+                continue
+
+            # Keep only the text inside the <article> element.
+            article = re.search(r"<article>([\s\S]*?)</article>", chapter_content)
+            if article is None:
+                # No article body in the payload: skip this chapter instead
+                # of aborting the whole download.
+                output_queue.put(f"无法获取章节内容: {chapter_title}，跳过。")
+                continue
+            chapter_text = article.group(1)
+
+            # Turn each <p> into a line break, then strip attribute-less tags.
+            chapter_text = re.sub(r"<p>", "\n", chapter_text)
+            chapter_text = re.sub(r"</?\w+>", "", chapter_text)
+
+            chapter_text = p.fix_publisher(chapter_text)
+
+            parts.append(f"\n\n\n{chapter_title}\n{chapter_text}")
+
+            output_queue.put(f"已获取 {chapter_title}")
+
+        _save_novel(file_path, "".join(parts), encoding)
+
+        output_queue.put(f"已保存{title}.txt")
+
+        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        output_queue.put(f"完成时间:{current_time}")
+
+    except Exception as e:
+        # Last-resort boundary: report the error and save what was collected.
+        output_queue.put(f"发生异常: \n{e}")
+        output_queue.put("正在尝试保存文件...")
+        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        output_queue.put(f"{current_time}")
+
+        _save_novel(file_path, "".join(parts), encoding)
+
+        output_queue.put("文件已保存！")
diff --git a/src/list_edition/list_edition.py b/src/list_edition/list_edition.py
@@ -0,0 +1,159 @@
+"""
+作者：星隅（xing-yv）
+
+版权所有（C）2023 星隅（xing-yv）
+
+本软件根据GNU通用公共许可证第三版（GPLv3）发布；
+你可以在以下位置找到该许可证的副本：
+https://www.gnu.org/licenses/gpl-3.0.html
+
+根据GPLv3的规定，您有权在遵循许可证的前提下自由使用、修改和分发本软件。
+请注意，根据许可证的要求，任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。
+
+本软件提供的是按"原样"提供的，没有任何明示或暗示的保证，包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内，作者明确放弃了所有明示或暗示的担保和条件。
+
+免责声明：
+该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
+
+请在使用本程序之前确保遵守相关法律法规和网站的使用政策，如有疑问，请咨询法律顾问。
+
+无论您对程序进行了任何操作，请始终保留此信息。
+"""
+
+import queue
+import threading
+import tkinter as tk
+import tkinter.messagebox
+from multiprocessing import Process, Queue
+import time
+import fanqie_list as fl
+
+
+class Spider:
+    """Queue of novel URLs consumed by a background worker thread.
+
+    Each URL taken from the queue is handed to ``fl.fanqie_l`` in a newly
+    started process; that process reports through ``output_queue``.
+    """
+
+    def __init__(self, output_func, output_queue):
+        # Stored as given; no method of this class calls it.
+        self.output_func = output_func
+        # Passed on to every download process.
+        self.output_queue = output_queue
+        # URLs waiting to be picked up by the worker thread.
+        self.url_queue = queue.Queue()
+        # Loop flag for worker(); cleared by stop().
+        self.is_running = True
+
+    @staticmethod
+    def crawl(url, output_queue):
+        """Start a process running ``fl.fanqie_l`` for ``url``, then wait 2 seconds."""
+        download_process = Process(
+            target=fl.fanqie_l,
+            args=(url, 'utf-8', output_queue),
+        )
+        download_process.start()
+        time.sleep(2)
+
+    def worker(self):
+        """Take URLs off the queue and launch a download for each until stopped."""
+        while self.is_running:
+            try:
+                # The 1-second timeout lets the loop re-check is_running.
+                next_url = self.url_queue.get(timeout=1)
+            except queue.Empty:
+                continue
+            else:
+                Spider.crawl(next_url, self.output_queue)
+                self.url_queue.task_done()
+
+    def start(self):
+        """Run worker() on a daemon thread."""
+        worker_thread = threading.Thread(target=self.worker, daemon=True)
+        worker_thread.start()
+
+    def add_url(self, url):
+        """Queue ``url`` if it contains "/page/"; show a message box either way."""
+        if "/page/" not in url:
+            tkinter.messagebox.showinfo("错误", "URL格式不正确，请重新输入")
+            return
+
+        self.url_queue.put(url)
+        tkinter.messagebox.showinfo("成功", "URL已添加到下载队列")
+
+    def stop(self):
+        """Ask the worker loop to exit after its current iteration."""
+        self.is_running = False
+
+
+def main():
+    """Build the Tk window, start the spider worker and run the event loop.
+
+    The window holds a read-only log area, a URL entry with a right-click
+    paste menu, and buttons to add a URL, list queued URLs and quit.
+    Messages from download processes arrive on a multiprocessing queue that
+    is polled from the Tk event loop.
+    """
+    root = tk.Tk()
+    root.title("番茄工具队列版")
+
+    # Window size.
+    root.geometry("600x400")
+
+    output_text = tk.Text(root, state='disabled')
+    output_text.pack()
+
+    # Scrollbar driving the log area.
+    scrollbar = tk.Scrollbar(root, command=output_text.yview)
+    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
+
+    # Keep the scrollbar in sync with the text widget's view.
+    output_text.config(yscrollcommand=scrollbar.set)
+
+    # Position the scrollbar manually.
+    scrollbar.place(x=580, y=0, height=320)
+
+    input_frame = tk.Frame(root)
+    input_frame.pack()
+    input_entry = tk.Entry(input_frame, width=50)
+    input_entry.pack(side=tk.LEFT)
+
+    # Position the input row manually.
+    input_frame.place(x=50, y=350)
+
+    def paste_text():
+        input_entry.event_generate('<<Paste>>')
+
+    def show_context_menu(event):
+        context_menu.post(event.x_root, event.y_root)
+
+    # Context menu offering "paste".
+    context_menu = tk.Menu(root, tearoff=0)
+    context_menu.add_command(label="粘贴", command=paste_text)
+
+    # Open the context menu on right-click in the entry. Bound once; the
+    # original repeated this identical bind call.
+    input_entry.bind("<Button-3>", show_context_menu)
+
+    output_queue = Queue()
+
+    def append_output(text):
+        # The text widget is kept disabled; unlock it only while inserting.
+        output_text.config(state='normal')
+        output_text.insert(tk.END, text + "\n")
+        output_text.config(state='disabled')
+
+    spider = Spider(append_output, output_queue)
+
+    spider.start()
+
+    def add_url():
+        url = input_entry.get()
+        spider.add_url(url)
+        input_entry.delete(0, tk.END)
+
+    add_button = tk.Button(input_frame, text="添加URL", command=add_url)
+    add_button.pack(side=tk.LEFT)
+
+    def list_urls():
+        urls = list(spider.url_queue.queue)
+        tkinter.messagebox.showinfo("队列中的URL", "\n".join(urls))
+
+    list_button = tk.Button(input_frame, text="列出URL", command=list_urls)
+    list_button.pack(side=tk.LEFT)
+
+    def stop_spider():
+        spider.stop()
+        root.quit()
+
+    stop_button = tk.Button(input_frame, text="退出", command=stop_spider)
+    stop_button.pack(side=tk.LEFT)
+
+    def check_output_queue():
+        # Drain whatever is available without blocking the UI thread.
+        # get_nowait() replaces the empty()/get() pair because empty() is
+        # documented as unreliable on multiprocessing queues.
+        while True:
+            try:
+                message = output_queue.get_nowait()
+            except queue.Empty:
+                break
+
+            output_text.config(state='normal')
+            output_text.insert(tk.END, message + "\n")
+            output_text.config(state='disabled')
+
+            # Scroll to the last line.
+            output_text.see(tk.END)
+
+        # Poll the output queue again in 100 ms.
+        root.after(100, check_output_queue)
+
+    check_output_queue()
+
+    root.mainloop()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/list_edition/public.py b/src/list_edition/public.py
@@ -0,0 +1,60 @@
+"""
+作者：星隅（xing-yv）
+
+版权所有（C）2023 星隅（xing-yv）
+
+本软件根据GNU通用公共许可证第三版（GPLv3）发布；
+你可以在以下位置找到该许可证的副本：
+https://www.gnu.org/licenses/gpl-3.0.html
+
+根据GPLv3的规定，您有权在遵循许可证的前提下自由使用、修改和分发本软件。
+请注意，根据许可证的要求，任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。
+
+本软件提供的是按"原样"提供的，没有任何明示或暗示的保证，包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内，作者明确放弃了所有明示或暗示的担保和条件。
+
+免责声明：
+该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
+
+请在使用本程序之前确保遵守相关法律法规和网站的使用政策，如有疑问，请咨询法律顾问。
+
+无论您对程序进行了任何操作，请始终保留此信息。
+"""
+
+import re
+
+
+# Translation table from characters that are unsafe in file names to
+# full-width look-alikes. The backslash is included because it is a path
+# separator on Windows; the earlier pattern ``[\/:*?"<>|]`` only escaped the
+# forward slash and never matched a literal backslash.
+_ILLEGAL_CHAR_TABLE = str.maketrans({
+    '\\': '＼',
+    '/': '／',
+    ':': '：',
+    '*': '＊',
+    '?': '？',
+    '"': '“',
+    '<': '＜',
+    '>': '＞',
+    '|': '｜',
+})
+
+
+def rename(name):
+    """Return ``name`` with file-name-unsafe characters replaced.
+
+    Each of ``\\ / : * ? " < > |`` is swapped for the full-width character
+    listed in ``_ILLEGAL_CHAR_TABLE``; every other character is kept as is.
+    """
+    return name.translate(_ILLEGAL_CHAR_TABLE)
+
+
+def fix_publisher(text):
+    """Strip the markup fragments that published works carry in their text.
+
+    Every match of each pattern below is deleted, one pattern after another
+    in the listed order, and the cleaned string is returned.
+    """
+    tag_patterns = (
+        r'<p class=".*?">',
+        r'<!--\?xml.*?>',
+        r'<link .*?/>',
+        r'<meta .*?/>',
+        r'<h1 .*?>',
+        r'<br/>',
+        r'<!DOCTYPE html .*?>',
+        r'<span .*?>',
+        r'<html .*?>',
+    )
+    cleaned = text
+    for tag_pattern in tag_patterns:
+        cleaned = re.sub(tag_pattern, '', cleaned)
+    return cleaned