Skip to content

Commit

Permalink
adding puppeteer timeout and fixing waitForSelector
Browse files Browse the repository at this point in the history
  • Loading branch information
Christian Kellner authored and Christian Kellner committed Jan 7, 2025
1 parent 26127ee commit e1db384
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 40 deletions.
4 changes: 2 additions & 2 deletions lib/provider/immonet.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import utils, {buildHash} from '../utils.js';
import utils, { buildHash } from '../utils.js';
let appliedBlackList = [];

/**
Expand Down Expand Up @@ -26,7 +26,7 @@ const config = {
url: null,
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'sortby=19',
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]',
waitForSelector: 'div[data-testid="serp-resultscount-testid"]',
crawlFields: {
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
title: 'button@title |trim',
Expand Down
2 changes: 1 addition & 1 deletion lib/provider/immowelt.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ const config = {
crawlContainer:
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'order=DateDesc',
waitForSelector: 'div[data-testid="cardmfe-price-testid"]',
waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
crawlFields: {
id: 'a@href',
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
Expand Down
1 change: 1 addition & 0 deletions lib/services/extractor/parser/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export function parse(crawlContainer, crawlFields, text, url) {

if ($(crawlContainer).length === 0) {
console.error('No elements in crawl container found for url ', url);
return null;
}

$(crawlContainer).each((_, element) => {
Expand Down
75 changes: 38 additions & 37 deletions lib/services/extractor/puppeteerExtractor.js
Original file line number Diff line number Diff line change
@@ -1,48 +1,49 @@
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {debug, DEFAULT_HEADER, botDetected} from './utils.js';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';

puppeteer.use(StealthPlugin());

// NOTE(review): this listing is a diff view without +/- markers, so most
// statements below appear twice: first the removed line(s), then the
// reformatted replacement. Per the commit, the only functional addition in
// this file is the `timeout` option passed to `puppeteer.launch`.

/**
 * Loads `url` in a Puppeteer-controlled browser and returns the page HTML.
 *
 * Steps, as visible in the code below:
 *  1. Launch a browser (headless unless `options.puppeteerHeadless` is set to
 *     a non-nullish value) and open a new page.
 *  2. Send `DEFAULT_HEADER` as extra HTTP headers and navigate to `url`,
 *     waiting until `domcontentloaded`.
 *  3. If `waitForSelector` is not null/undefined, wait for that selector and
 *     read the `innerHTML` of the first matching element; otherwise read the
 *     full page content. This snapshot is only used for the bot check.
 *  4. If `botDetected(pageSource, statusCode)` is truthy, log a warning and
 *     return `null`.
 *  5. Otherwise return `page.content()` (the full page, not the snapshot).
 *
 * Any error thrown along the way is logged and turned into a `null` result;
 * the browser, if it was launched, is always closed in `finally`.
 *
 * @param {string} url - Address to load.
 * @param {string|null|undefined} waitForSelector - CSS selector to wait for
 *   before reading the page; skipped when null/undefined.
 * @param {object} options - Reads `puppeteerHeadless` (defaults to `true` via
 *   `??`) and, in the added version, `puppeteerTimeout` (falls back to 30_000
 *   via `||`, so `0` also falls back).
 * @returns {Promise<string|null>} Page HTML, or `null` on bot detection or on
 *   any error.
 */
export default async function execute(url, waitForSelector, options) {
let browser;
try {
debug(`Sending request to ${url} using Puppeteer.`);
let browser;
try {
debug(`Sending request to ${url} using Puppeteer.`);

browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox']
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, {
waitUntil: 'domcontentloaded'
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
pageSource = await page.evaluate(selector => {
return document.querySelector(selector).innerHTML;
}, waitForSelector);
} else {
pageSource = await page.content();
}
browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
// NOTE(review): this is the `timeout` launch option; presumably it bounds
// browser start-up rather than the waitForSelector call below — confirm
// against the Puppeteer docs.
timeout: options.puppeteerTimeout || 30_000,
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, {
waitUntil: 'domcontentloaded',
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
pageSource = await page.evaluate((selector) => {
return document.querySelector(selector).innerHTML;
}, waitForSelector);
} else {
pageSource = await page.content();
}

const statusCode = response.status();
const statusCode = response.status();

if (botDetected(pageSource, statusCode)) {
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
}
if (botDetected(pageSource, statusCode)) {
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
}

return await page.content();
} catch (error) {
console.error('Error executing with puppeteer executor', error);
return null;
} finally {
if (browser != null) {
await browser.close();
}
return await page.content();
} catch (error) {
console.error('Error executing with puppeteer executor', error);
return null;
} finally {
if (browser != null) {
await browser.close();
}
}
}
}

0 comments on commit e1db384

Please sign in to comment.