diff --git a/src/index.ts b/src/index.ts index bfc614d..4dc51ce 100644 --- a/src/index.ts +++ b/src/index.ts @@ -13,7 +13,6 @@ export async function fetchSite( options: Options ): Promise { const fetcher = new Fetcher(options) - return fetcher.fetchSite(url) } @@ -34,7 +33,6 @@ class Fetcher { #getContentSelector(pathname: string) { if (typeof this.options.contentSelector === "function") return this.options.contentSelector({ pathname }) - return this.options.contentSelector } @@ -45,17 +43,19 @@ class Fetcher { }` ) - await this.#fetchPage(url, { + // Initial call with no parent URL + await this.#fetchPage(url, undefined, { skipMatch: true, }) await this.#queue.onIdle() - return this.#pages } + // Updated #fetchPage to include parentUrl async #fetchPage( url: string, + parentUrl: string | undefined, options: { skipMatch?: boolean } @@ -68,8 +68,6 @@ class Fetcher { this.#fetched.add(pathname) - // return if not matched - // we don't need to extract content for this page if ( !options.skipMatch && this.options.match && @@ -86,8 +84,15 @@ class Fetcher { }, }) + // Enhanced logging with parentUrl if (!res.ok) { - logger.warn(`Failed to fetch ${url}: ${res.statusText}`) + if (parentUrl) { + logger.warn( + `Failed to fetch ${url} (linked from ${parentUrl}): ${res.statusText}` + ) + } else { + logger.warn(`Failed to fetch ${url}: ${res.statusText}`) + } return } @@ -96,47 +101,39 @@ class Fetcher { } const contentType = res.headers.get("content-type") - if (!contentType?.includes("text/html")) { logger.warn(`Not a HTML page: ${url}`) return } const resUrl = new URL(res.url) - - // redirected to other site, ignore if (resUrl.host !== host) { logger.warn(`Redirected from ${host} to ${resUrl.host}`) return } - const extraUrls: string[] = [] + const extraUrls: string[] = [] const $ = load(await res.text()) $("script,style,link,img,video").remove() $("a").each((_, el) => { const href = $(el).attr("href") - - if (!href) { - return - } + if (!href) return try { const thisUrl = new URL(href, url) - if (thisUrl.host !== host) { - return - } - + if (thisUrl.host !== host) return extraUrls.push(thisUrl.href) } catch { logger.warn(`Failed to parse URL: ${href}`) } }) + // Pass current url as parentUrl for child URLs if (extraUrls.length > 0) { - for (const url of extraUrls) { + for (const childUrl of extraUrls) { this.#queue.add(() => - this.#fetchPage(url, { ...options, skipMatch: false }) + this.#fetchPage(childUrl, url, { ...options, skipMatch: false }) ) } } @@ -162,19 +159,14 @@ class Fetcher { } window.document.write(html) - await window.happyDOM.waitUntilComplete() const article = new Readability(window.document as any).parse() - await window.happyDOM.close() - if (!article) { - return - } + if (!article) return const content = toMarkdown(article.content) - this.#pages.set(pathname, { title: article.title || pageTitle, url, @@ -190,7 +182,6 @@ export function serializePages( if (format === "json") { return JSON.stringify([...pages.values()]) } - return [...pages.values()] .map((page) => `