Showing 3 changed files with 386 additions and 0 deletions.
@@ -0,0 +1,167 @@ fanqie_list.py (file name inferred from the import in the GUI module below)
""" | ||
作者:星隅(xing-yv) | ||
版权所有(C)2023 星隅(xing-yv) | ||
本软件根据GNU通用公共许可证第三版(GPLv3)发布; | ||
你可以在以下位置找到该许可证的副本: | ||
https://www.gnu.org/licenses/gpl-3.0.html | ||
根据GPLv3的规定,您有权在遵循许可证的前提下自由使用、修改和分发本软件。 | ||
请注意,根据许可证的要求,任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。 | ||
本软件提供的是按"原样"提供的,没有任何明示或暗示的保证,包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内,作者明确放弃了所有明示或暗示的担保和条件。 | ||
免责声明: | ||
该程序仅用于学习和研究Python网络爬虫和网页处理技术,不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险,均由用户自行承担,与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。 | ||
请在使用本程序之前确保遵守相关法律法规和网站的使用政策,如有疑问,请咨询法律顾问。 | ||
无论您对程序进行了任何操作,请始终保留此信息。 | ||
""" | ||
# Import required modules
import os
import re
import time
import datetime
from os import path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import public as p


# Normal-mode download function for Fanqie novels
def fanqie_l(url, encoding, output_queue):

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
    }

    # Fetch the page source
    response = requests.get(url, headers=headers)
    html = response.text

    # Parse the page source
    soup = BeautifulSoup(html, "html.parser")

    # Get the novel title (a class_="info-name" filter is left commented out in the original)
    title = soup.find("h1").get_text()
    # Replace characters that are illegal in file names
    title = p.rename(title)

    output_queue.put(f"Fetching: {title}")

    # Get the novel metadata
    info = soup.find("div", class_="page-header-info").get_text()

    # Get the novel synopsis
    intro = soup.find("div", class_="page-abstract-content").get_text()

    # Assemble the header of the output text
    content = f"""If you want this novel to keep receiving updates, do not rename this file.
Downloaded with the open-source tool by @星隅 (xing-yv)
Repository: https://github.com/xing-yv/fanqie-novel-download
Gitee: https://gitee.com/xingyv1024/fanqie-novel-download/
No one may restrict your access to this tool. If someone offers a download-for-you service without first telling you how to obtain the tool yourself, please report them to the author: [email protected]
{title}
{info}
{intro}
"""

    # Collect all chapter links
    chapters = soup.find_all("div", class_="chapter-item")

    # Build the output file path
    file_path = path.join('output', f'{title}.txt')

    os.makedirs("output", exist_ok=True)

    try:
        # Walk through the chapter links
        for chapter in chapters:
            time.sleep(1)
            # Get the chapter title
            chapter_title = chapter.find("a").get_text()

            # Get the chapter URL
            chapter_url = urljoin(url, chapter.find("a")["href"])

            # Extract the chapter id
            chapter_id = re.search(r"/(\d+)", chapter_url).group(1)

            # Build the API URL
            api_url = f"https://novel.snssdk.com/api/novel/book/reader/full/v1/?device_platform=android&parent_enterfrom=novel_channel_search.tab.&aid=2329&platform_id=1&group_id={chapter_id}&item_id={chapter_id}"

            # Try to fetch the chapter content
            chapter_content = None
            retry_count = 1
            while retry_count < 4:  # maximum number of attempts
                # Get the API response
                api_response = requests.get(api_url, headers=headers)

                # Parse the API response as JSON
                api_data = api_response.json()

                if "data" in api_data and "content" in api_data["data"]:
                    chapter_content = api_data["data"]["content"]
                    break  # content fetched successfully, stop retrying
                else:
                    if retry_count == 1:
                        output_queue.put(f"Failed to fetch {chapter_title}, retrying...")
                    output_queue.put(f"Retry ({retry_count}/3) fetching the chapter content")
                    retry_count += 1  # try again

            if retry_count == 4:
                output_queue.put(f"Could not fetch chapter content: {chapter_title}, skipping.")
                continue  # too many failed attempts, skip this chapter

            # Extract the text inside the <article> tag
            chapter_text = re.search(r"<article>([\s\S]*?)</article>", chapter_content).group(1)

            # Replace <p> tags with newlines
            chapter_text = re.sub(r"<p>", "\n", chapter_text)

            # Strip the remaining HTML tags
            chapter_text = re.sub(r"</?\w+>", "", chapter_text)

            chapter_text = p.fix_publisher(chapter_text)

            # Append the chapter title and text to the novel content
            content += f"\n\n\n{chapter_title}\n{chapter_text}"

            # Report progress
            output_queue.put(f"Fetched {chapter_title}")

        # Encode the novel content with the requested encoding
        data = content.encode(encoding, errors='ignore')

        # Save the file
        with open(file_path, "wb") as f:
            f.write(data)

        # Report completion
        output_queue.put(f"Saved {title}.txt")

        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        output_queue.put(f"Finished at: {current_time}")

    except Exception as e:
        # Catch any exception and save what has been downloaded so far
        output_queue.put(f"An exception occurred:\n{e}")
        output_queue.put("Trying to save the file...")
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        output_queue.put(f"{current_time}")
        # Encode the novel content with the requested encoding
        data = content.encode(encoding, errors='ignore')

        # Save the file
        with open(file_path, "wb") as f:
            f.write(data)

        output_queue.put("File saved!")
        return
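A minimal driver sketch for the function above (assumptions: this file is saved as fanqie_list.py, as the import in the GUI module below implies, and the URL is a made-up placeholder in the "/page/" format the GUI checks for; a real novel page URL is needed for it to succeed). fanqie_l only calls .put() on its third argument, so a plain queue.Queue is enough to run it outside the GUI:

import queue

import fanqie_list as fl

q = queue.Queue()
# Placeholder URL for illustration only.
fl.fanqie_l("https://example.com/page/1234567890", "utf-8", q)
# Print whatever progress messages the crawl produced.
while not q.empty():
    print(q.get())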
@@ -0,0 +1,159 @@ (GUI entry point; the file name is not shown in this diff)
""" | ||
作者:星隅(xing-yv) | ||
版权所有(C)2023 星隅(xing-yv) | ||
本软件根据GNU通用公共许可证第三版(GPLv3)发布; | ||
你可以在以下位置找到该许可证的副本: | ||
https://www.gnu.org/licenses/gpl-3.0.html | ||
根据GPLv3的规定,您有权在遵循许可证的前提下自由使用、修改和分发本软件。 | ||
请注意,根据许可证的要求,任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。 | ||
本软件提供的是按"原样"提供的,没有任何明示或暗示的保证,包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内,作者明确放弃了所有明示或暗示的担保和条件。 | ||
免责声明: | ||
该程序仅用于学习和研究Python网络爬虫和网页处理技术,不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险,均由用户自行承担,与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。 | ||
请在使用本程序之前确保遵守相关法律法规和网站的使用政策,如有疑问,请咨询法律顾问。 | ||
无论您对程序进行了任何操作,请始终保留此信息。 | ||
""" | ||
import queue
import threading
import tkinter as tk
import tkinter.messagebox
from multiprocessing import Process, Queue
import time

import fanqie_list as fl


class Spider:
    def __init__(self, output_func, output_queue):
        self.url_queue = queue.Queue()
        self.output_queue = output_queue
        self.is_running = True
        self.output_func = output_func  # currently unused by Spider

    @staticmethod
    def crawl(url, output_queue):
        # Run the crawler function in a new process
        p = Process(target=fl.fanqie_l, args=(url, 'utf-8', output_queue))
        p.start()
        time.sleep(2)

    def worker(self):
        while self.is_running:
            try:
                url = self.url_queue.get(timeout=1)
                Spider.crawl(url, self.output_queue)
                self.url_queue.task_done()
            except queue.Empty:
                continue

    def start(self):
        threading.Thread(target=self.worker, daemon=True).start()

    def add_url(self, url):
        if "/page/" not in url:
            tkinter.messagebox.showinfo("Error", "Malformed URL, please enter it again")
            return
        else:
            self.url_queue.put(url)
            tkinter.messagebox.showinfo("Success", "URL added to the download queue")

    def stop(self):
        self.is_running = False


def main():
    root = tk.Tk()
    root.title("Fanqie Tool (queue edition)")

    # Set the window size
    root.geometry("600x400")

    output_text = tk.Text(root, state='disabled')
    output_text.pack()

    # Create the scrollbar
    scrollbar = tk.Scrollbar(root, command=output_text.yview)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    # Point the Text widget's yscrollcommand at the scrollbar's set method
    output_text.config(yscrollcommand=scrollbar.set)

    # Position the scrollbar manually
    scrollbar.place(x=580, y=0, height=320)

    input_frame = tk.Frame(root)
    input_frame.pack()
    input_entry = tk.Entry(input_frame, width=50)
    input_entry.pack(side=tk.LEFT)

    # Position the input frame
    input_frame.place(x=50, y=350)

    def paste_text():
        input_entry.event_generate('<<Paste>>')

    def show_context_menu(event):
        context_menu.post(event.x_root, event.y_root)

    # Create the context menu
    context_menu = tk.Menu(root, tearoff=0)
    context_menu.add_command(label="Paste", command=paste_text)

    # Bind the context menu to the input entry
    input_entry.bind("<Button-3>", show_context_menu)

    output_queue = Queue()

    spider = Spider(lambda text: output_text.config(state='normal') or output_text.insert(tk.END, text + "\n") or output_text.config(state='disabled'), output_queue)

    spider.start()

    def add_url():
        url = input_entry.get()
        spider.add_url(url)
        input_entry.delete(0, tk.END)

    add_button = tk.Button(input_frame, text="Add URL", command=add_url)
    add_button.pack(side=tk.LEFT)

    def list_urls():
        urls = list(spider.url_queue.queue)
        tkinter.messagebox.showinfo("URLs in queue", "\n".join(urls))

    list_button = tk.Button(input_frame, text="List URLs", command=list_urls)
    list_button.pack(side=tk.LEFT)

    def stop_spider():
        spider.stop()
        root.quit()

    stop_button = tk.Button(input_frame, text="Quit", command=stop_spider)
    stop_button.pack(side=tk.LEFT)

    def check_output_queue():
        while not output_queue.empty():
            message = output_queue.get()
            output_text.config(state='normal')
            output_text.insert(tk.END, message + "\n")
            output_text.config(state='disabled')

            # Scroll to the last line
            output_text.see(tk.END)

        root.after(100, check_output_queue)  # poll the output queue every 100 ms

    check_output_queue()

    root.mainloop()


if __name__ == "__main__":
    main()
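The process-plus-queue pattern the GUI relies on, in isolation (a sketch; every name here is local to this example): the crawl runs in a child process, progress messages travel through a multiprocessing.Queue, and the parent drains the queue without blocking on the child, which is what keeps the Tk main loop responsive above.

from multiprocessing import Process, Queue

def worker(out):
    # Stand-in for fl.fanqie_l: emit a few progress lines.
    for i in range(3):
        out.put(f"step {i}")

if __name__ == "__main__":
    q = Queue()
    child = Process(target=worker, args=(q,))
    child.start()
    child.join()
    # Drain whatever the child reported.
    while not q.empty():
        print(q.get())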
@@ -0,0 +1,60 @@ public.py (file name inferred from "import public as p" above)
""" | ||
作者:星隅(xing-yv) | ||
版权所有(C)2023 星隅(xing-yv) | ||
本软件根据GNU通用公共许可证第三版(GPLv3)发布; | ||
你可以在以下位置找到该许可证的副本: | ||
https://www.gnu.org/licenses/gpl-3.0.html | ||
根据GPLv3的规定,您有权在遵循许可证的前提下自由使用、修改和分发本软件。 | ||
请注意,根据许可证的要求,任何对本软件的修改和分发都必须包括原始的版权声明和GPLv3的完整文本。 | ||
本软件提供的是按"原样"提供的,没有任何明示或暗示的保证,包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内,作者明确放弃了所有明示或暗示的担保和条件。 | ||
免责声明: | ||
该程序仅用于学习和研究Python网络爬虫和网页处理技术,不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险,均由用户自行承担,与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。 | ||
请在使用本程序之前确保遵守相关法律法规和网站的使用政策,如有疑问,请咨询法律顾问。 | ||
无论您对程序进行了任何操作,请始终保留此信息。 | ||
""" | ||
import re


# Replace characters that are illegal in file names
def rename(name):
    # Pattern matching the illegal characters
    illegal_characters_pattern = r'[/:*?"<>|]'

    # Full-width replacements for each illegal character
    replacement_dict = {
        '/': '/',
        ':': ':',
        '*': '*',
        '?': '?',
        '"': '“',
        '<': '<',
        '>': '>',
        '|': '|'
    }

    # Substitute every illegal character via the mapping
    sanitized_path = re.sub(illegal_characters_pattern, lambda x: replacement_dict[x.group(0)], name)

    return sanitized_path


def fix_publisher(text):
    # Targeted removal of the tags that published-book content carries
    text = re.sub(r'<p class=".*?">', '', text)
    text = re.sub(r'<!--\?xml.*?>', '', text)
    text = re.sub(r'<link .*?/>', '', text)
    text = re.sub(r'<meta .*?/>', '', text)
    text = re.sub(r'<h1 .*?>', '', text)
    text = re.sub(r'<br/>', '', text)
    text = re.sub(r'<!DOCTYPE html .*?>', '', text)
    text = re.sub(r'<span .*?>', '', text)
    text = re.sub(r'<html .*?>', '', text)
    return text
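A quick sanity check of the two helpers (inputs invented for illustration): rename() swaps characters that are illegal in Windows file names for full-width lookalikes, and fix_publisher() strips the leftover publisher markup that the generic tag-removal pass in fanqie_list does not catch.

print(rename('第1章:他是谁?'))                  # -> 第1章:他是谁?
print(fix_publisher('<p class="pub">正文<br/>'))  # -> 正文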