Skip to content

If there is a dead URL, show where it was linked from #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 19 additions & 28 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ export async function fetchSite(
options: Options
): Promise<FetchSiteResult> {
const fetcher = new Fetcher(options)

return fetcher.fetchSite(url)
}

Expand All @@ -34,7 +33,6 @@ class Fetcher {
/**
 * Resolves the content selector to use for the page at `pathname`.
 *
 * When `options.contentSelector` is a function, it is invoked with
 * `{ pathname }` and its result is returned; otherwise the configured
 * value is returned unchanged.
 *
 * @param pathname - Path of the page, passed to a function-valued selector.
 * @returns The selector produced by, or stored in, `options.contentSelector`.
 */
#getContentSelector(pathname: string) {
  // Keep the call as a property access on `this.options` so the callee's
  // receiver is the same as in a direct method call.
  return typeof this.options.contentSelector === "function"
    ? this.options.contentSelector({ pathname })
    : this.options.contentSelector
}

Expand All @@ -45,17 +43,19 @@ class Fetcher {
}`
)

await this.#fetchPage(url, {
// Initial call with no parent URL
await this.#fetchPage(url, undefined, {
skipMatch: true,
})

await this.#queue.onIdle()

return this.#pages
}

// Updated #fetchPage to include parentUrl
async #fetchPage(
url: string,
parentUrl: string | undefined,
options: {
skipMatch?: boolean
}
Expand All @@ -68,8 +68,6 @@ class Fetcher {

this.#fetched.add(pathname)

// return if not matched
// we don't need to extract content for this page
if (
!options.skipMatch &&
this.options.match &&
Expand All @@ -86,8 +84,15 @@ class Fetcher {
},
})

// Enhanced logging with parentUrl
if (!res.ok) {
logger.warn(`Failed to fetch ${url}: ${res.statusText}`)
if (parentUrl) {
logger.warn(
`Failed to fetch ${url} (linked from ${parentUrl}): ${res.statusText}`
)
} else {
logger.warn(`Failed to fetch ${url}: ${res.statusText}`)
}
return
}

Expand All @@ -96,47 +101,39 @@ class Fetcher {
}

const contentType = res.headers.get("content-type")

if (!contentType?.includes("text/html")) {
logger.warn(`Not a HTML page: ${url}`)
return
}

const resUrl = new URL(res.url)

// redirected to other site, ignore
if (resUrl.host !== host) {
logger.warn(`Redirected from ${host} to ${resUrl.host}`)
return
}
const extraUrls: string[] = []

const extraUrls: string[] = []
const $ = load(await res.text())
$("script,style,link,img,video").remove()

$("a").each((_, el) => {
const href = $(el).attr("href")

if (!href) {
return
}
if (!href) return

try {
const thisUrl = new URL(href, url)
if (thisUrl.host !== host) {
return
}

if (thisUrl.host !== host) return
extraUrls.push(thisUrl.href)
} catch {
logger.warn(`Failed to parse URL: ${href}`)
}
})

// Pass current url as parentUrl for child URLs
if (extraUrls.length > 0) {
for (const url of extraUrls) {
for (const childUrl of extraUrls) {
this.#queue.add(() =>
this.#fetchPage(url, { ...options, skipMatch: false })
this.#fetchPage(childUrl, url, { ...options, skipMatch: false })
)
}
}
Expand All @@ -162,19 +159,14 @@ class Fetcher {
}

window.document.write(html)

await window.happyDOM.waitUntilComplete()

const article = new Readability(window.document as any).parse()

await window.happyDOM.close()

if (!article) {
return
}
if (!article) return

const content = toMarkdown(article.content)

this.#pages.set(pathname, {
title: article.title || pageTitle,
url,
Expand All @@ -190,7 +182,6 @@ export function serializePages(
if (format === "json") {
return JSON.stringify([...pages.values()])
}

return [...pages.values()]
.map((page) =>
`<page>
Expand Down