Merge branch 'test'

shing-yu · Oct 16, 2023 · aaba149 · aaba149
2 parents 284beb5 + 9762041
commit aaba149
Show file tree

Hide file tree

Showing 9 changed files with 277 additions and 1,262 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,4 @@
 # 忽略IDE文件
 .idea/
 src/__pycache__/
-src/setup/__pycache__/
+src/list_edition/__pycache__/
diff --git a/src/setup/fanqie_update.py → src/list_edition/fanqie_list.py b/src/setup/fanqie_update.py → src/list_edition/fanqie_list.py
@@ -13,79 +13,30 @@
 本软件提供的是按"原样"提供的，没有任何明示或暗示的保证，包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内，作者明确放弃了所有明示或暗示的担保和条件。
 
 免责声明：
-该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和版权持有人无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
+该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
 
 请在使用本程序之前确保遵守相关法律法规和网站的使用政策，如有疑问，请咨询法律顾问。
 
 无论您对程序进行了任何操作，请始终保留此信息。
 """
+import os
 
 # 导入必要的模块
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
-import datetime
 import re
-import os
-
-
-# 定义番茄更新函数
-def fanqie_update(user_agent, data_folder):
-    # 指定小说文件夹
-    novel_folder = "小说"
-
-    novel_files = [file for file in os.listdir(novel_folder) if file.endswith(".txt")]
+import datetime
+from os import path
+import time
+import public as p
 
-    if not novel_files:
-        print("没有可更新的文件")
-        return
 
-    no_corresponding_files = True  # 用于标记是否存在对应的txt和upd文件
-
-    for txt_file in novel_files:
-        txt_file_path = os.path.join(novel_folder, txt_file)
-        upd_file_path = os.path.join(data_folder, txt_file.replace(".txt", ".upd"))
-        novel_name = txt_file.replace(".txt", "")
-
-        if os.path.exists(upd_file_path):
-
-            print(f"正在尝试更新: {novel_name}")
-            # 读取用于更新的文件元数据
-            with open(upd_file_path, 'r') as file:
-                lines = file.readlines()
-
-            # 保存上次更新时间和上次章节id到变量
-            last_update_time = lines[0].strip()
-            url = lines[1].strip()
-            last_chapter_id = lines[2].strip()
-            encoding = lines[3].strip()
-            print(f"上次更新时间{last_update_time}")
-            result = download_novel(url, encoding, user_agent, last_chapter_id, txt_file_path)
-            if result == "DN":
-                print(f"{novel_name} 已是最新，不需要更新。\n")
-            else:
-                print(f"{novel_name} 已更新完成。\n")
-                # 获取当前系统时间
-                current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                # 创建要写入元信息文件的内容
-                new_content = f"{current_time}\n{url}\n{result}\n{encoding}"
-                # 打开文件并完全覆盖内容
-                with open(upd_file_path, "w") as file:
-                    file.write(new_content)
-
-            no_corresponding_files = False
-        else:
-            print(f"{novel_name} 不是通过此工具下载，无法更新")
-
-    if no_corresponding_files:
-        print("没有可更新的文件")
-
-
-# 定义更新番茄小说的函数
-def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
+# 定义正常模式用来下载番茄小说的函数
+def fanqie_l(url, encoding, output_queue):
 
     headers = {
-        "User-Agent": user_agent
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
     }
 
     # 获取网页源码
@@ -95,27 +46,44 @@ def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
     # 解析网页源码
     soup = BeautifulSoup(html, "html.parser")
 
+    # 获取小说标题
+    title = soup.find("h1").get_text()
+    # , class_ = "info-name"
+    # 替换非法字符
+    title = p.rename(title)
+
+    output_queue.put(f"正在获取: {title}")
+
+    # 获取小说信息
+    info = soup.find("div", class_="page-header-info").get_text()
+
+    # 获取小说简介
+    intro = soup.find("div", class_="page-abstract-content").get_text()
+
+    # 拼接小说内容字符串
+    content = f"""如果需要小说更新，请勿修改文件名
+使用 @星隅(xing-yv) 所作开源工具下载
+开源仓库地址:https://github.com/xing-yv/fanqie-novel-download
+Gitee:https://gitee.com/xingyv1024/fanqie-novel-download/
+任何人无权限制您访问本工具，如果有向您提供代下载服务者未事先告知您工具的获取方式，请向作者举报:[email protected]
+
+{title}
+{info}
+{intro}
+"""
+
     # 获取所有章节链接
     chapters = soup.find_all("div", class_="chapter-item")
 
-    last_chapter_id = None
-    # 找到起始章节的索引
-    start_index = 0
-    for i, chapter in enumerate(chapters):
-        chapter_url = urljoin(url, chapter.find("a")["href"])
-        chapter_id_tmp = re.search(r"/(\d+)", chapter_url).group(1)
-        if chapter_id_tmp == start_chapter_id:  # 更新函数，所以前进一个章节
-            start_index = i + 1
-        last_chapter_id = chapter_id_tmp
-
-    # 判断是否已经最新
-    if start_index >= len(chapters):
-        return "DN"  # 返回Don't Need.
-
-    # 打开文件
-    with open(txt_file_path, 'ab') as f:
-        # 从起始章节开始遍历每个章节链接
-        for chapter in chapters[start_index:]:
+    # 定义文件名
+    file_path = path.join('output', f'{title}.txt')
+
+    os.makedirs("output", exist_ok=True)
+
+    try:
+        # 遍历每个章节链接
+        for chapter in chapters:
+            time.sleep(1)
             # 获取章节标题
             chapter_title = chapter.find("a").get_text()
 
@@ -143,12 +111,12 @@ def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
                     break  # 如果成功获取章节内容，跳出重试循环
                 else:
                     if retry_count == 1:
-                        print(f"{chapter_title} 获取失败，正在尝试重试...")
-                    print(f"第 ({retry_count}/3) 次重试获取章节内容")
+                        output_queue.put(f"{chapter_title} 获取失败，正在尝试重试...")
+                    output_queue.put(f"第 ({retry_count}/3) 次重试获取章节内容")
                     retry_count += 1  # 否则重试
 
             if retry_count == 4:
-                print(f"无法获取章节内容: {chapter_title}，跳过。")
+                output_queue.put(f"无法获取章节内容: {chapter_title}，跳过。")
                 continue  # 重试次数过多后，跳过当前章节
 
             # 提取文章标签中的文本
@@ -160,17 +128,40 @@ def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
             # 去除其他 html 标签
             chapter_text = re.sub(r"</?\w+>", "", chapter_text)
 
+            chapter_text = p.fix_publisher(chapter_text)
+
             # 在小说内容字符串中添加章节标题和内容
-            content = f"\n\n\n{chapter_title}\n{chapter_text}"
+            content += f"\n\n\n{chapter_title}\n{chapter_text}"
 
-            # 根据编码转换小说内容字符串为二进制数据
-            data = content.encode(encoding, errors='ignore')
+            # 打印进度信息
+            output_queue.put(f"已获取 {chapter_title}")
+
+        # 根据编码转换小说内容字符串为二进制数据
+        data = content.encode(encoding, errors='ignore')
 
-            # 将数据追加到文件中
+        # 保存文件
+        with open(file_path, "wb") as f:
             f.write(data)
 
-            # 打印进度信息
-            print(f"已增加: {chapter_title}")
+        # 打印完成信息
+        output_queue.put(f"已保存{title}.txt")
 
-    # 返回更新完成
-    return last_chapter_id
+        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        output_queue.put(f"完成时间:{current_time}")
+
+    except Exception as e:
+        # 捕获所有异常，及时保存文件
+        output_queue.put(f"发生异常: \n{e}")
+        output_queue.put("正在尝试保存文件...")
+        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        output_queue.put(f"{current_time}")
+        # 根据编码转换小说内容字符串为二进制数据
+        data = content.encode(encoding, errors='ignore')
+
+        # 保存文件
+        with open(file_path, "wb") as f:
+            f.write(data)
+
+        output_queue.put("文件已保存！")
+        return
diff --git a/src/list_edition/list_edition.py b/src/list_edition/list_edition.py
@@ -0,0 +1,159 @@
+"""
+作者：星隅（xing-yv）
+
+版权所有（C）2023 星隅（xing-yv）
+
+本软件根据GNU通用公共许可证第三版（GPLv3）发布；
+你可以在以下位置找到该许可证的副本：
+https://www.gnu.org/licenses/gpl-3.0.html
+
+根据GPLv3的规定，您有权在遵循许可证的前提下自由使用、修改和分发本软件。
+请注意，根据许可证的要求，任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。
+
+本软件提供的是按"原样"提供的，没有任何明示或暗示的保证，包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内，作者明确放弃了所有明示或暗示的担保和条件。
+
+免责声明：
+该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
+
+请在使用本程序之前确保遵守相关法律法规和网站的使用政策，如有疑问，请咨询法律顾问。
+
+无论您对程序进行了任何操作，请始终保留此信息。
+"""
+
+import queue
+import threading
+import tkinter as tk
+import tkinter.messagebox
+from multiprocessing import Process, Queue
+import time
+import fanqie_list as fl
+
+
+class Spider:  # Queues novel URLs and, from a background thread, launches one download process per URL.
+    def __init__(self, output_func, output_queue):  # output_func is only stored here; no method of this class calls it
+        self.url_queue = queue.Queue()  # thread-safe FIFO of URLs waiting to be picked up by worker()
+        self.output_queue = output_queue  # passed on to every download process started by crawl()
+        self.is_running = True  # loop flag polled by worker(); cleared by stop()
+        self.output_func = output_func
+
+    @staticmethod
+    def crawl(url, output_queue):  # start a download of `url` in a separate process
+        # Create a new process to run the crawler function
+        p = Process(target=fl.fanqie_l, args=(url, 'utf-8', output_queue))  # 'utf-8' is passed as fanqie_l's encoding argument
+        p.start()  # the process is not joined, so crawl() returns while the download is still running
+        time.sleep(2)  # pause 2 s before returning, which spaces out consecutive process launches
+
+    def worker(self):  # thread body: take URLs off the queue and hand each one to crawl()
+        while self.is_running:
+            try:
+                url = self.url_queue.get(timeout=1)  # 1 s timeout so the is_running flag is re-checked regularly
+                Spider.crawl(url, self.output_queue)
+                self.url_queue.task_done()
+            except queue.Empty:  # nothing queued within the timeout; loop again
+                continue
+
+    def start(self):  # run worker() on a daemon thread
+        threading.Thread(target=self.worker, daemon=True).start()
+
+    def add_url(self, url):  # validate `url`, enqueue it, and report the outcome in a message box
+        if "/page/" not in url:  # the only validation: the URL must contain "/page/"
+            tkinter.messagebox.showinfo("错误", "URL格式不正确，请重新输入")
+            return
+        else:
+            self.url_queue.put(url)
+            tkinter.messagebox.showinfo("成功", "URL已添加到下载队列")
+
+    def stop(self):  # ask worker() to exit; download processes already started are not touched here
+        self.is_running = False
+
+
+def main():  # Build the Tk window, start the Spider worker thread, and run the GUI event loop.
+    root = tk.Tk()
+    root.title("番茄工具队列版")
+
+    # Set the window size
+    root.geometry("600x400")
+
+    output_text = tk.Text(root, state='disabled')  # read-only log area; enabled only while text is inserted
+    output_text.pack()
+
+    # Create the scrollbar
+    scrollbar = tk.Scrollbar(root, command=output_text.yview)
+    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
+
+    # Point the Text widget's yscrollcommand at the scrollbar's set method
+    output_text.config(yscrollcommand=scrollbar.set)
+
+    # Manually adjust the scrollbar position
+    scrollbar.place(x=580, y=0, height=320)
+
+    input_frame = tk.Frame(root)
+    input_frame.pack()
+    input_entry = tk.Entry(input_frame, width=50)
+    input_entry.pack(side=tk.LEFT)
+
+    # Adjust the position of the input frame
+    input_frame.place(x=50, y=350)
+
+    def paste_text():  # context-menu action: fire the virtual <<Paste>> event on the entry
+        input_entry.event_generate('<<Paste>>')
+
+    def show_context_menu(event):  # pop the menu up at the pointer's screen coordinates
+        context_menu.post(event.x_root, event.y_root)
+
+    # Create the context menu
+    context_menu = tk.Menu(root, tearoff=0)
+    context_menu.add_command(label="粘贴", command=paste_text)
+
+    # Bind the context menu to the input entry
+    input_entry.bind("<Button-3>", show_context_menu)
+
+    input_entry.bind("<Button-3>", show_context_menu)  # NOTE(review): repeats the bind above with identical arguments
+
+    output_queue = Queue()  # multiprocessing.Queue: the download processes put status messages here
+
+    spider = Spider(lambda text: output_text.config(state='normal') or output_text.insert(tk.END, text + "\n") or output_text.config(state='disabled'), output_queue)
+
+    spider.start()
+
+    def add_url():  # button handler: submit the entry's text to the spider, then clear the entry
+        url = input_entry.get()
+        spider.add_url(url)
+        input_entry.delete(0, tk.END)
+
+    add_button = tk.Button(input_frame, text="添加URL", command=add_url)
+    add_button.pack(side=tk.LEFT)
+
+    def list_urls():  # button handler: show the URLs still waiting in the queue
+        urls = list(spider.url_queue.queue)  # copy of the queue's current contents
+        tkinter.messagebox.showinfo("队列中的URL", "\n".join(urls))
+
+    list_button = tk.Button(input_frame, text="列出URL", command=list_urls)
+    list_button.pack(side=tk.LEFT)
+
+    def stop_spider():  # button handler: stop the worker loop and leave the Tk main loop
+        spider.stop()
+        root.quit()
+
+    stop_button = tk.Button(input_frame, text="退出", command=stop_spider)
+    stop_button.pack(side=tk.LEFT)
+
+    def check_output_queue():  # drain pending messages into the log area, then reschedule itself
+        while not output_queue.empty():
+            message = output_queue.get()
+            output_text.config(state='normal')  # enable the widget so the insert takes effect
+            output_text.insert(tk.END, message + "\n")
+            output_text.config(state='disabled')
+
+            # Scroll to the last line
+            output_text.see(tk.END)
+
+        root.after(100, check_output_queue)  # Check the output queue again in 100 ms
+
+    check_output_queue()
+
+    root.mainloop()
+
+
+if __name__ == "__main__":  # guard so that importing this module (e.g. by a spawned multiprocessing child) does not open the GUI
+    main()