Skip to content
This repository has been archived by the owner on Mar 22, 2024. It is now read-only.

Commit

Permalink
Merge pull request #6 from weiwei-cool/main
Browse files — browse the repository at this point in the history
初步解决之前提交的格式错误issues
  • Loading branch information
shing-yu authored Oct 14, 2023
2 parents 3e0dcde + 200135f commit 6e3e1b7
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/fanqie_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ def download_novels(url, encoding, user_agent, path_choice, folder_path, data_fo
# 去除其他 html 标签
chapter_text = re.sub(r"</?\w+>", "", chapter_text)

# 去除带class的p标签
# TODO 格式乱码
chapter_text = re.sub(r'<p class=".*?">', '', chapter_text)

# 在小说内容字符串中添加章节标题和内容
content += f"\n\n\n{chapter_title}\n{chapter_text}"

Expand Down
4 changes: 4 additions & 0 deletions src/fanqie_chapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ def fanqie_c(url, encoding, user_agent, path_choice, start_chapter_id):
# 去除其他 html 标签
chapter_text = re.sub(r"</?\w+>", "", chapter_text)

# 去除带class的p标签
# TODO 格式乱码
chapter_text = re.sub(r'<p class=".*?">', '', chapter_text)

# 在章节内容字符串中添加章节标题和内容
content_all = f"{chapter_title}\n{chapter_text}"

Expand Down
4 changes: 4 additions & 0 deletions src/fanqie_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ def fanqie_d(url, encoding, user_agent, path_choice, data_folder, start_chapter_
# 去除其他 html 标签
chapter_text = re.sub(r"</?\w+>", "", chapter_text)

# 去除带class的p标签
# TODO 格式乱码
chapter_text = re.sub(r'<p class=".*?">', '', chapter_text)

# 在小说内容字符串中添加章节标题和内容
content += f"\n\n\n{chapter_title}\n{chapter_text}"

Expand Down
4 changes: 4 additions & 0 deletions src/fanqie_normal.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ def fanqie_n(url, encoding, user_agent, path_choice, data_folder, start_chapter_
# 去除其他 html 标签
chapter_text = re.sub(r"</?\w+>", "", chapter_text)

# 去除带class的p标签
# TODO 格式乱码
chapter_text = re.sub(r'<p class=".*?">', '', chapter_text)

# 在小说内容字符串中添加章节标题和内容
content += f"\n\n\n{chapter_title}\n{chapter_text}"

Expand Down

0 comments on commit 6e3e1b7

Please sign in to comment.