Skip to content

Commit

Permalink
fix: webscraper not getting links
Browse files (browse the repository at this point in the history)
  • Loading branch information
okradze committed Nov 29, 2023
1 parent 5c15858 commit 8e2ba04
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions apps/server/tools/webscraper/webscraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -92,14 +92,18 @@ def extract_with_bs4(self, url):
content_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "a"]
content = " ".join(
[
- tag.text.strip()
+ (tag.text.strip() + " " + tag.get("href", ""))
+ if tag.name == "a"
+ else tag.text.strip()
for tag in main_content.find_all(content_tags)
]
)
else:
content = " ".join(
[
- tag.text.strip()
+ (tag.text.strip() + " " + tag.get("href", ""))
+ if tag.name == "a"
+ else tag.text.strip()
for tag in soup.find_all(
["p", "h1", "h2", "h3", "h4", "h5", "h6"]
)
Expand Down

0 comments on commit 8e2ba04

Please sign in to comment.