This repository has been archived by the owner on Mar 22, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 9 changed files with 277 additions and 1,262 deletions.
@@ -1,4 +1,4 @@
# Ignore IDE files
.idea/
src/__pycache__/
src/setup/__pycache__/
src/list_edition/__pycache__/
@@ -13,79 +13,30 @@
This software is provided "as is", without warranties of any kind, express or implied, including but not limited to merchantability and fitness for a particular purpose. The author assumes no liability for any direct or indirect damage or other claims. To the maximum extent permitted by applicable law, the author expressly disclaims all express and implied warranties and conditions.
Disclaimer:
This program is intended solely for learning and researching Python web crawling and page processing; it must not be used for any illegal activity or to infringe the rights of others. All legal liability and risk arising from its use rests solely with the user and is unrelated to the author and copyright holder. The author accepts no liability for any loss or damage caused by use of this program.
This program is intended solely for learning and researching Python web crawling and page processing; it must not be used for any illegal activity or to infringe the rights of others. All legal liability and risk arising from its use rests solely with the user and is unrelated to the author and the project's contributors. The author accepts no liability for any loss or damage caused by use of this program.
Before using this program, make sure you comply with applicable laws and regulations and the target site's terms of use; consult legal counsel if in doubt.
Whatever you do with this program, always keep this notice.
"""
import os

# Import the required modules
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import datetime
import re
import os

# Define the Fanqie update function
def fanqie_update(user_agent, data_folder):
    # Novel folder
    novel_folder = "小说"

    novel_files = [file for file in os.listdir(novel_folder) if file.endswith(".txt")]
import datetime
from os import path
import time
import public as p

    if not novel_files:
        print("没有可更新的文件")
        return

    no_corresponding_files = True  # Flag: no matching .txt/.upd pair found yet

    for txt_file in novel_files:
        txt_file_path = os.path.join(novel_folder, txt_file)
        upd_file_path = os.path.join(data_folder, txt_file.replace(".txt", ".upd"))
        novel_name = txt_file.replace(".txt", "")

        if os.path.exists(upd_file_path):

            print(f"正在尝试更新: {novel_name}")
            # Read the metadata used for updating
            with open(upd_file_path, 'r') as file:
                lines = file.readlines()

            # Keep the last update time and last chapter id
            last_update_time = lines[0].strip()
            url = lines[1].strip()
            last_chapter_id = lines[2].strip()
            encoding = lines[3].strip()
            print(f"上次更新时间{last_update_time}")
            result = download_novel(url, encoding, user_agent, last_chapter_id, txt_file_path)
            if result == "DN":
                print(f"{novel_name} 已是最新,不需要更新。\n")
            else:
                print(f"{novel_name} 已更新完成。\n")
                # Get the current system time
                current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                # Build the new metadata file content
                new_content = f"{current_time}\n{url}\n{result}\n{encoding}"
                # Open the metadata file and overwrite it completely
                with open(upd_file_path, "w") as file:
                    file.write(new_content)

            no_corresponding_files = False
        else:
            print(f"{novel_name} 不是通过此工具下载,无法更新")

    if no_corresponding_files:
        print("没有可更新的文件")

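For reference, the `.upd` metadata that `fanqie_update` consumes is four plain-text lines: last update time, source URL, last chapter id, and encoding. A minimal reader/writer for that layout (a sketch; the helper and type names are illustrative, not part of the repository):

```python
from dataclasses import dataclass

@dataclass
class UpdMeta:
    last_update_time: str  # e.g. "2023-11-01 12:00:00"
    url: str               # novel page URL
    last_chapter_id: str   # numeric id of the last downloaded chapter
    encoding: str          # encoding used for the .txt file

def read_upd(upd_path):
    # One field per line, in a fixed order, matching the reads above.
    with open(upd_path, "r") as f:
        lines = [line.strip() for line in f]
    return UpdMeta(lines[0], lines[1], lines[2], lines[3])

def write_upd(upd_path, meta):
    # Overwrite the whole file, as the update loop does after a successful run.
    with open(upd_path, "w") as f:
        f.write(f"{meta.last_update_time}\n{meta.url}\n{meta.last_chapter_id}\n{meta.encoding}")
```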
# Define the function that updates a Fanqie novel
def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
# Define the normal-mode function that downloads a Fanqie novel
def fanqie_l(url, encoding, output_queue):

    headers = {
        "User-Agent": user_agent
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
    }

    # Fetch the page source
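The request itself falls outside this hunk; a typical implementation of the fetch step, consistent with the headers built above, might look like this (a sketch, not the repository's exact code; the timeout value is an assumption):

```python
import requests

def fetch_html(url, headers, timeout=10):
    # Fetch the page source; raise on HTTP errors so the caller can retry.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.text
```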

@@ -95,27 +46,44 @@ def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
    # Parse the page source
    soup = BeautifulSoup(html, "html.parser")

    # Get the novel title
    title = soup.find("h1").get_text()
    # , class_ = "info-name"
    # Replace characters that are illegal in file names
    title = p.rename(title)

    output_queue.put(f"正在获取: {title}")

    # Get the novel info
    info = soup.find("div", class_="page-header-info").get_text()

    # Get the novel synopsis
    intro = soup.find("div", class_="page-abstract-content").get_text()

    # Assemble the novel content string
    content = f"""如果需要小说更新,请勿修改文件名
使用 @星隅(xing-yv) 所作开源工具下载
开源仓库地址:https://github.com/xing-yv/fanqie-novel-download
Gitee:https://gitee.com/xingyv1024/fanqie-novel-download/
任何人无权限制您访问本工具,如果有向您提供代下载服务者未事先告知您工具的获取方式,请向作者举报:[email protected]
{title}
{info}
{intro}
"""

    # Get all chapter links
    chapters = soup.find_all("div", class_="chapter-item")

    last_chapter_id = None
    # Find the index of the starting chapter
    start_index = 0
    for i, chapter in enumerate(chapters):
        chapter_url = urljoin(url, chapter.find("a")["href"])
        chapter_id_tmp = re.search(r"/(\d+)", chapter_url).group(1)
        if chapter_id_tmp == start_chapter_id:  # This is the update path, so advance one chapter past the last one
            start_index = i + 1
            last_chapter_id = chapter_id_tmp

    # Check whether the novel is already up to date
    if start_index >= len(chapters):
        return "DN"  # "Don't Need"

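The up-to-date check hinges on the numeric chapter id embedded in each chapter URL. A self-contained illustration of that extraction (the URLs are hypothetical):

```python
import re
from urllib.parse import urljoin

page_url = "https://example.com/page/7043321234567890123"  # hypothetical novel page
href = "/reader/7043321234567890456"                       # hypothetical chapter link
chapter_url = urljoin(page_url, href)
# The first "/<digits>" group in the joined URL is taken as the chapter id.
chapter_id = re.search(r"/(\d+)", chapter_url).group(1)
print(chapter_id)  # 7043321234567890456
```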
    # Open the file
    with open(txt_file_path, 'ab') as f:
        # Iterate over the chapter links, starting from the start chapter
        for chapter in chapters[start_index:]:
    # Build the output file name
    file_path = path.join('output', f'{title}.txt')

    os.makedirs("output", exist_ok=True)

    try:
        # Iterate over every chapter link
        for chapter in chapters:
            time.sleep(1)
            # Get the chapter title
            chapter_title = chapter.find("a").get_text()

@@ -143,12 +111,12 @@ def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
                break  # Chapter content fetched successfully; leave the retry loop
            else:
                if retry_count == 1:
                    print(f"{chapter_title} 获取失败,正在尝试重试...")
                print(f"第 ({retry_count}/3) 次重试获取章节内容")
                    output_queue.put(f"{chapter_title} 获取失败,正在尝试重试...")
                output_queue.put(f"第 ({retry_count}/3) 次重试获取章节内容")
                retry_count += 1  # Otherwise, retry

            if retry_count == 4:
                print(f"无法获取章节内容: {chapter_title},跳过。")
                output_queue.put(f"无法获取章节内容: {chapter_title},跳过。")
                continue  # Too many retries; skip this chapter

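The retry loop above makes a first attempt and then up to three more before skipping a chapter. The same policy as a reusable helper (a sketch, not code from this commit; it assumes the fetch callable returns a falsy value on failure):

```python
import time

def fetch_with_retries(fetch, max_attempts=4, delay=1):
    # Keep calling fetch() until it returns something truthy,
    # giving up after max_attempts attempts.
    for attempt in range(1, max_attempts + 1):
        result = fetch()
        if result:
            return result
        if attempt < max_attempts:
            time.sleep(delay)  # brief pause before retrying
    return None  # caller skips this chapter, as the loop above does
```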
@@ -160,17 +128,40 @@ def download_novel(url, encoding, user_agent, start_chapter_id, txt_file_path):
            # Strip the remaining html tags
            chapter_text = re.sub(r"</?\w+>", "", chapter_text)

            chapter_text = p.fix_publisher(chapter_text)

            # Append the chapter title and content to the novel content string
            content = f"\n\n\n{chapter_title}\n{chapter_text}"
            content += f"\n\n\n{chapter_title}\n{chapter_text}"

            # Encode the novel content string with the chosen encoding
            data = content.encode(encoding, errors='ignore')
            # Print progress
            output_queue.put(f"已获取 {chapter_title}")

        # Encode the novel content string with the chosen encoding
        data = content.encode(encoding, errors='ignore')

        # Append the data to the file
        # Save the file
        with open(file_path, "wb") as f:
            f.write(data)

        # Print progress
        print(f"已增加: {chapter_title}")
        # Print the completion message
        output_queue.put(f"已保存{title}.txt")


        # Return: update complete
        return last_chapter_id
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        output_queue.put(f"完成时间:{current_time}")

    except Exception as e:
        # Catch any exception and save the file promptly
        output_queue.put(f"发生异常: \n{e}")
        output_queue.put("正在尝试保存文件...")
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        output_queue.put(f"{current_time}")
        # Encode the novel content string to bytes
        data = content.encode(encoding, errors='ignore')

        # Save the file
        with open(file_path, "wb") as f:
            f.write(data)

        output_queue.put("文件已保存!")
        return
@@ -0,0 +1,159 @@
""" | ||
作者:星隅(xing-yv) | ||
版权所有(C)2023 星隅(xing-yv) | ||
本软件根据GNU通用公共许可证第三版(GPLv3)发布; | ||
你可以在以下位置找到该许可证的副本: | ||
https://www.gnu.org/licenses/gpl-3.0.html | ||
根据GPLv3的规定,您有权在遵循许可证的前提下自由使用、修改和分发本软件。 | ||
请注意,根据许可证的要求,任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。 | ||
本软件提供的是按"原样"提供的,没有任何明示或暗示的保证,包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内,作者明确放弃了所有明示或暗示的担保和条件。 | ||
免责声明: | ||
该程序仅用于学习和研究Python网络爬虫和网页处理技术,不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险,均由用户自行承担,与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。 | ||
请在使用本程序之前确保遵守相关法律法规和网站的使用政策,如有疑问,请咨询法律顾问。 | ||
无论您对程序进行了任何操作,请始终保留此信息。 | ||
""" | ||

import queue
import threading
import tkinter as tk
import tkinter.messagebox
from multiprocessing import Process, Queue
import time
import fanqie_list as fl

class Spider:
    def __init__(self, output_func, output_queue):
        self.url_queue = queue.Queue()
        self.output_queue = output_queue
        self.is_running = True
        self.output_func = output_func

    @staticmethod
    def crawl(url, output_queue):
        # Run the crawler function in a separate process
        p = Process(target=fl.fanqie_l, args=(url, 'utf-8', output_queue))
        p.start()
        time.sleep(2)

    def worker(self):
        while self.is_running:
            try:
                url = self.url_queue.get(timeout=1)
                Spider.crawl(url, self.output_queue)
                self.url_queue.task_done()
            except queue.Empty:
                continue

    def start(self):
        threading.Thread(target=self.worker, daemon=True).start()

    def add_url(self, url):
        if "/page/" not in url:
            tkinter.messagebox.showinfo("错误", "URL格式不正确,请重新输入")
            return
        else:
            self.url_queue.put(url)
            tkinter.messagebox.showinfo("成功", "URL已添加到下载队列")

    def stop(self):
        self.is_running = False

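One design consequence worth noting: `Spider.crawl` starts the download process, sleeps two seconds, and returns without joining, so several downloads can overlap if URLs are added quickly. If strictly sequential processing were wanted, a joining variant (a sketch of an alternative, not the committed behavior) would be:

```python
from multiprocessing import Process
import fanqie_list as fl

def crawl_sequential(url, output_queue):
    # Variant of Spider.crawl: block until this download finishes,
    # so queued URLs are processed strictly one at a time.
    p = Process(target=fl.fanqie_l, args=(url, 'utf-8', output_queue))
    p.start()
    p.join()
```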
def main():
    root = tk.Tk()
    root.title("番茄工具队列版")

    # Set the window size
    root.geometry("600x400")

    output_text = tk.Text(root, state='disabled')
    output_text.pack()

    # Create the scrollbar
    scrollbar = tk.Scrollbar(root, command=output_text.yview)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    # Point the Text widget's yscrollcommand at the scrollbar's set method
    output_text.config(yscrollcommand=scrollbar.set)

    # Manually position the scrollbar
    scrollbar.place(x=580, y=0, height=320)

    input_frame = tk.Frame(root)
    input_frame.pack()
    input_entry = tk.Entry(input_frame, width=50)
    input_entry.pack(side=tk.LEFT)

    # Reposition the input frame
    input_frame.place(x=50, y=350)

    def paste_text():
        input_entry.event_generate('<<Paste>>')

    def show_context_menu(event):
        context_menu.post(event.x_root, event.y_root)

    # Create the context menu
    context_menu = tk.Menu(root, tearoff=0)
    context_menu.add_command(label="粘贴", command=paste_text)

    # Bind the context menu to the entry widget
    input_entry.bind("<Button-3>", show_context_menu)

    output_queue = Queue()

    spider = Spider(lambda text: output_text.config(state='normal') or output_text.insert(tk.END, text + "\n") or output_text.config(state='disabled'), output_queue)

    spider.start()

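The inline lambda passed to `Spider` chains three widget calls with `or`, which only works because each call returns a falsy value; an equivalent named function is easier to read (same behavior, just restructured). Note also that `output_func` is stored by `Spider.__init__` but never called in this commit; output actually flows through `output_queue` and the polling function below.

```python
def append_output(text):
    # Unlock the read-only Text widget, append the line, lock it again.
    output_text.config(state='normal')
    output_text.insert(tk.END, text + "\n")
    output_text.config(state='disabled')

spider = Spider(append_output, output_queue)
```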
    def add_url():
        url = input_entry.get()
        spider.add_url(url)
        input_entry.delete(0, tk.END)

    add_button = tk.Button(input_frame, text="添加URL", command=add_url)
    add_button.pack(side=tk.LEFT)

    def list_urls():
        urls = list(spider.url_queue.queue)
        tkinter.messagebox.showinfo("队列中的URL", "\n".join(urls))

    list_button = tk.Button(input_frame, text="列出URL", command=list_urls)
    list_button.pack(side=tk.LEFT)

    def stop_spider():
        spider.stop()
        root.quit()

    stop_button = tk.Button(input_frame, text="退出", command=stop_spider)
    stop_button.pack(side=tk.LEFT)

    def check_output_queue():
        while not output_queue.empty():
            message = output_queue.get()
            output_text.config(state='normal')
            output_text.insert(tk.END, message + "\n")
            output_text.config(state='disabled')

            # Scroll to the last line
            output_text.see(tk.END)

        root.after(100, check_output_queue)  # Poll the output queue every 100 ms

    check_output_queue()

    root.mainloop()


if __name__ == "__main__":
    main()