const Apify = require('apify');
const _ = require('underscore');
const httpProxy = require('http-proxy');
const url = require('url');
const contentType = require('content-type');
const {
    tools,
    browserTools,
    constants: { META_KEY, DEFAULT_VIEWPORT, DEVTOOLS_TIMEOUT_SECS },
} = require('@apify/scraper-tools');
const rp = require('request-promise');

const GlobalStore = require('./global_store');
const createBundle = require('./bundle.browser');
const SCHEMA = require('../INPUT_SCHEMA');

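// Port on which the debugged Chromium instance exposes the DevTools protocol
// (passed to --remote-debugging-port below).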
const CHROME_DEBUGGER_PORT = 4322;

const { utils: { log, puppeteer } } = Apify;

/**
 * Replicates the INPUT_SCHEMA with JavaScript types for quick reference
 * and IDE type check integration.
 *
 * @typedef {Object} Input
 * @property {Object[]} startUrls
 * @property {boolean} useRequestQueue
 * @property {Object[]} pseudoUrls
 * @property {string} linkSelector
 * @property {boolean} keepUrlFragments
 * @property {string} pageFunction
 * @property {Object} proxyConfiguration
 * @property {boolean} debugLog
 * @property {boolean} browserLog
 * @property {boolean} injectJQuery
 * @property {boolean} injectUnderscore
 * @property {boolean} downloadMedia
 * @property {boolean} downloadCss
 * @property {number} maxRequestRetries
 * @property {number} maxPagesPerCrawl
 * @property {number} maxResultsPerCrawl
 * @property {number} maxCrawlingDepth
 * @property {number} maxConcurrency
 * @property {number} pageLoadTimeoutSecs
 * @property {number} pageFunctionTimeoutSecs
 * @property {Object} customData
 * @property {Array} initialCookies
 * @property {Array} waitUntil
 * @property {boolean} useChrome
 * @property {boolean} useStealth
 * @property {boolean} ignoreCorsAndCsp
 * @property {boolean} ignoreSslErrors
 * @property {boolean} chromeDebugger
 */

/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a pageFunction invocation.
 */
class CrawlerSetup {
    /* eslint-disable class-methods-use-this */
    constructor(input) {
        this.name = 'Web Scraper';
        // Set log level early to prevent missed messages.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep this as string to be immutable.
        this.rawInput = JSON.stringify(input);

        // Attempt to load page function from disk if not present on input.
        tools.maybeLoadPageFunctionFromDisk(input, __dirname);

        // Validate INPUT if not running on Apify Cloud Platform.
        if (!Apify.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        /**
         * @type {Input}
         */
        this.input = input;
        this.env = Apify.getEnv();

        // Validations
        if (this.input.pseudoUrls.length && !this.input.useRequestQueue) {
            throw new Error('Cannot enqueue links using Pseudo URLs without using a Request Queue. '
                + 'Either select the "Use Request Queue" option to enable Request Queue or '
                + 'remove your Pseudo URLs.');
        }
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) throw new Error('The pseudoUrls Array must only contain Objects.');
            if (purl.userData && !tools.isPlainObject(purl.userData)) throw new Error('The userData property of a pseudoUrl must be an Object.');
        });
        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) throw new Error('The initialCookies Array must only contain Objects.');
        });
        this.input.waitUntil.forEach((event) => {
            if (!/^(domcontentloaded|load|networkidle2|networkidle0)$/.test(event)) {
                throw new Error('Navigation wait until events must be valid. See tooltip.');
            }
        });
        tools.evalFunctionOrThrow(this.input.pageFunction);

        // Used to store page specific data.
        this.pageContexts = new WeakMap();

        // Used to store data that persists across navigations.
        this.globalStore = new GlobalStore();

        // Excluded resources
        this.blockedUrlPatterns = [];
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [...this.blockedUrlPatterns,
                '.jpg', '.jpeg', '.png', '.svg', '.gif', '.webp', '.webm', '.ico', '.woff', '.eot',
            ];
        }
        if (!this.input.downloadCss) this.blockedUrlPatterns.push('.css');

        // Start Chromium with Debugger any time the page function includes the keyword.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Initialize async operations.
        this.crawler = null;
        this.requestList = null;
        this.requestQueue = null;
        this.dataset = null;
        this.keyValueStore = null;
        this.proxy = null;
        this.initPromise = this._initializeAsync();
    }

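    /**
     * Opens the request list, the request queue (if enabled), the dataset
     * and the key-value store. The count of items already in the dataset
     * is kept, so that `maxResultsPerCrawl` is honored across restarts.
     */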
    async _initializeAsync() {
        // RequestList
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });
        this.requestList = await Apify.openRequestList('WEB_SCRAPER', startUrls);

        // RequestQueue if selected
        if (this.input.useRequestQueue) this.requestQueue = await Apify.openRequestQueue();

        // Dataset
        this.dataset = await Apify.openDataset();
        const { itemsCount } = await this.dataset.getInfo();
        this.pagesOutputted = itemsCount || 0;

        // KeyValueStore
        this.keyValueStore = await Apify.openKeyValueStore();
    }

    /**
     * Resolves to a `PuppeteerCrawler` instance configured from the input
     * received in the constructor.
     * @returns {Promise<PuppeteerCrawler>}
     */
    async createCrawler() {
        await this.initPromise;

        const args = [];
        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');

        const options = {
            handlePageFunction: this._handlePageFunction.bind(this),
            requestList: this.requestList,
            requestQueue: this.requestQueue,
            handlePageTimeoutSecs: this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageFunctionTimeoutSecs,
            gotoFunction: this._gotoFunction.bind(this),
            handleFailedRequestFunction: this._handleFailedRequestFunction.bind(this),
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyUrls: this.input.proxyConfiguration.proxyUrls,
            // launchPuppeteerFunction: use default,
            puppeteerPoolOptions: {
                useLiveView: true,
                recycleDiskCache: true,
            },
            launchPuppeteerOptions: {
                ...(_.omit(this.input.proxyConfiguration, 'proxyUrls')),
                ignoreHTTPSErrors: this.input.ignoreSslErrors,
                defaultViewport: DEFAULT_VIEWPORT,
                devtools: this.devtools,
                headless: false,
                useChrome: this.input.useChrome,
                stealth: this.input.useStealth,
                args,
            },
        };

        if (this.input.chromeDebugger) {
            log.info(`Starting scraper with the Chrome debugger attached:
- The scraper will run with a concurrency of 1.
- The whole run will be performed with a single browser (= 1 proxy IP).
- You can add a "debugger;" statement to your page function to set a breakpoint.`);

            options.puppeteerPoolOptions.useLiveView = false;
            options.puppeteerPoolOptions.retireInstanceAfterRequestCount = 1000000;
            options.maxConcurrency = 1;
            options.launchPuppeteerFunction = async (launchPuppeteerOptions) => {
                launchPuppeteerOptions.args = [`--remote-debugging-port=${CHROME_DEBUGGER_PORT}`, '--user-data-dir=remote-profile'];
                launchPuppeteerOptions.defaultViewport = { width: 720, height: 1024 };

                const browser = await Apify.launchPuppeteer(launchPuppeteerOptions);
                const containerHost = url.parse(process.env.APIFY_CONTAINER_URL).host;

                // Start HTTP proxy from live view port to Chrome debugger port.
                if (this.proxy) this.proxy.close();
                this.proxy = httpProxy.createServer({
                    target: {
                        host: 'localhost',
                        port: CHROME_DEBUGGER_PORT,
                    },
                    ws: true,
                }).listen(4321);

                // Fetches the debugger URL of the page we are scraping.
                const debuggerUrlPromise = (async () => {
                    while (true) {
                        const pageList = await rp({ url: `http://localhost:${CHROME_DEBUGGER_PORT}/json/list`, json: true });

                        // All the internal debugger pages start with something like "devtools://"...
                        const debuggerPage = pageList.filter(page => page.url.startsWith('http')).pop();
                        if (debuggerPage) return debuggerPage;

                        await Apify.utils.sleep(1000);
                    }
                })();

                // Intercept requests to the HTTP proxy.
                this.proxy.on('proxyReq', async (proxyReq, req, res) => {
                    // We need Chrome to think that it's on localhost, otherwise it throws an error...
                    proxyReq.setHeader('Host', 'localhost');

                    // URL of the debugger of the page we are scraping.
                    let targetUrl = req.url;
                    if (targetUrl === '/') {
                        proxyReq.socket.pause(); // We need to pause it to be able to perform async operations.
                        targetUrl = (await debuggerUrlPromise).devtoolsFrontendUrl; // This one includes ws=localhost, so the redirect below will be performed.
                        targetUrl = targetUrl.replace(`:${CHROME_DEBUGGER_PORT}`, '');
                        proxyReq.socket.resume();
                    }

                    // ...but then Chrome thinks it's running on localhost, so we need to change the
                    // websocket host in the URL to be correct.
                    if (targetUrl.includes('ws=localhost')) {
                        const newUrl = targetUrl.replace('ws=localhost', `wss=${containerHost}`);
                        res.writeHead(301, { Location: newUrl });
                        res.end();
                    }
                });

                return browser;
            };
        }

        this.crawler = new Apify.PuppeteerCrawler(options);

        if (this.input.chromeDebugger) {
            // this.crawler.puppeteerPool appears to be created only once crawler.run()
            // is invoked, so poll until it exists and then enable page reuse.
            const interval = setInterval(() => {
                if (this.crawler.puppeteerPool) {
                    this.crawler.puppeteerPool.reusePages = true;
                    clearInterval(interval);
                }
            }, 10);
        }

        return this.crawler;
    }

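    /**
     * Prepares the page and navigates to the request URL: sets up the page
     * context, console forwarding, resource blocking, initial cookies and
     * CSP bypass, and injects the browser bundle together with the wrapped
     * pageFunction before navigation.
     * @param {Object} environment
     * @returns {Promise<Response>}
     */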
    async _gotoFunction({ request, page }) {
        const start = process.hrtime();

        // Create a new page context with a new random key for Apify namespace.
        const pageContext = {
            apifyNamespace: await tools.createRandomHash(),
            skipLinks: false,
        };
        this.pageContexts.set(page, pageContext);

        // Attach a console listener to get all logs as soon as possible.
        if (this.input.browserLog) browserTools.dumpConsole(page);

        // Prevent download of stylesheets and media, unless selected otherwise
        if (this.blockedUrlPatterns.length) {
            await puppeteer.blockRequests(page, {
                urlPatterns: this.blockedUrlPatterns,
            });
        }

        // Add initial cookies, if any.
        if (this.input.initialCookies.length) await page.setCookie(...this.input.initialCookies);

        // Disable content security policy.
        if (this.input.ignoreCorsAndCsp) await page.setBypassCSP(true);

        tools.logPerformance(request, 'gotoFunction INIT', start);
        const handleStart = process.hrtime();
        pageContext.browserHandles = await this._injectBrowserHandles(page, pageContext);
        tools.logPerformance(request, 'gotoFunction INJECTION HANDLES', handleStart);

        const evalStart = process.hrtime();
        await Promise.all([
            page.evaluateOnNewDocument(createBundle, pageContext.apifyNamespace),
            page.evaluateOnNewDocument(browserTools.wrapPageFunction(this.input.pageFunction, pageContext.apifyNamespace)),
        ]);
        tools.logPerformance(request, 'gotoFunction INJECTION EVAL', evalStart);

        // Invoke navigation.
        const navStart = process.hrtime();
        const response = await puppeteer.gotoExtended(page, request, {
            timeout: (this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageLoadTimeoutSecs) * 1000,
            waitUntil: this.input.waitUntil,
        });
        await this._waitForLoadEventWhenXml(page, response);
        tools.logPerformance(request, 'gotoFunction NAVIGATION', navStart);

        const delayStart = process.hrtime();
        await this._assertNamespace(page, pageContext.apifyNamespace);

        // Inject selected libraries
        if (this.input.injectJQuery) await puppeteer.injectJQuery(page);
        if (this.input.injectUnderscore) await puppeteer.injectUnderscore(page);

        tools.logPerformance(request, 'gotoFunction INJECTION DELAY', delayStart);
        tools.logPerformance(request, 'gotoFunction EXECUTION', start);
        return response;
    }

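    /**
     * Logs the last error of a request that exhausted all of its retries
     * and stores a failure record for it in the dataset.
     */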
    _handleFailedRequestFunction({ request }) {
        const lastError = request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
        return this._handleResult(request, {}, null, true);
    }

    /**
     * First of all, it initializes the state that is exposed to the user via
     * `pageFunction` context.
     *
     * Then it invokes the user provided `pageFunction` with the prescribed context
     * and saves its return value.
     *
     * Finally, it makes decisions based on the current state and post-processes
     * the data returned from the `pageFunction`.
     * @param {Object} environment
     * @returns {Promise<void>}
     */
    async _handlePageFunction({ request, response, page, autoscaledPool }) {
        const start = process.hrtime();

        const pageContext = this.pageContexts.get(page);

        /**
         * PRE-PROCESSING
         */
        // Make sure that an object containing internal metadata
        // is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(autoscaledPool);
        if (aborted) return;

        // Setup Context and pass the configuration down to Browser.
        const contextOptions = {
            crawlerSetup: Object.assign(
                _.pick(this, ['rawInput', 'env']),
                _.pick(this.input, ['customData', 'useRequestQueue', 'injectJQuery', 'injectUnderscore']),
                { META_KEY },
            ),
            browserHandles: pageContext.browserHandles,
            pageFunctionArguments: {
                request,
                response: {
                    status: response && response.status(),
                    headers: response && response.headers(),
                },
            },
        };

        /**
         * USER FUNCTION EXECUTION
         */
        tools.logPerformance(request, 'handlePageFunction PREPROCESSING', start);
        const startUserFn = process.hrtime();

        const namespace = pageContext.apifyNamespace;
        const output = await page.evaluate(async (ctxOpts, namespc) => {
            /* eslint-disable no-shadow */
            const context = window[namespc].createContext(ctxOpts);
            const output = {};
            try {
                output.pageFunctionResult = await window[namespc].pageFunction(context);
            } catch (err) {
                output.pageFunctionError = Object.getOwnPropertyNames(err)
                    .reduce((memo, name) => {
                        memo[name] = err[name];
                        return memo;
                    }, {});
            }
            // This needs to be added after pageFunction has run.
            output.requestFromBrowser = context.request;

            /**
             * Since Dates cannot travel back to Node and Puppeteer does not use .toJSON
             * to stringify, they come back as empty objects. We could use JSON.stringify
             * ourselves, but that exposes us to overridden .toJSON in the target websites.
             * This hack is not ideal, but it avoids both problems.
             */
            function replaceAllDatesInObjectWithISOStrings(obj) {
                for (const [key, value] of Object.entries(obj)) {
                    if (value instanceof Date && typeof value.toISOString === 'function') {
                        obj[key] = value.toISOString();
                    } else if (value && typeof value === 'object') {
                        replaceAllDatesInObjectWithISOStrings(value);
                    }
                }
                return obj;
            }

            return replaceAllDatesInObjectWithISOStrings(output);
        }, contextOptions, namespace);

        tools.logPerformance(request, 'handlePageFunction USER FUNCTION', startUserFn);
        const finishUserFn = process.hrtime();

        /**
         * POST-PROCESSING
         */
        const { pageFunctionResult, requestFromBrowser, pageFunctionError } = output;
        // Merge requestFromBrowser into request to preserve modifications that
        // may have been made in browser context.
        Object.assign(request, requestFromBrowser);

        // Throw error from pageFunction, if any.
        if (pageFunctionError) throw tools.createError(pageFunctionError);

        // Enqueue more links if a link selector is available,
        // unless the user invoked the `skipLinks()` context function
        // or maxCrawlingDepth would be exceeded.
        if (!pageContext.skipLinks) {
            await this._handleLinks(page, request);
        }

        // Save the `pageFunction`'s result (or just metadata) to the default dataset.
        await this._handleResult(request, response, pageFunctionResult);

        tools.logPerformance(request, 'handlePageFunction POSTPROCESSING', finishUserFn);
        tools.logPerformance(request, 'handlePageFunction EXECUTION', start);
    }

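    /**
     * Aborts the crawl by stopping the AutoscaledPool once the number of
     * outputted pages reaches `maxResultsPerCrawl`.
     * @returns {Promise<boolean>} `true` when the crawl was aborted.
     */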
    async _handleMaxResultsPerCrawl(autoscaledPool) {
        if (!this.input.maxResultsPerCrawl || this.pagesOutputted < this.input.maxResultsPerCrawl) return false;
        log.info(`User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`);
        await autoscaledPool.abort();
        return true;
    }

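    /**
     * Enqueues links matched by the link selector and Pseudo URLs, unless
     * the maximum crawling depth has been reached. Every enqueued request
     * records its parent request and depth in the metadata under `META_KEY`.
     */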
    async _handleLinks(page, request) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const start = process.hrtime();

        const currentDepth = request.userData[META_KEY].depth;
        const hasReachedMaxDepth = this.input.maxCrawlingDepth && currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(`Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`);
            return;
        }

        const enqueueOptions = {
            page,
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            requestQueue: this.requestQueue,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData = {
                    [META_KEY]: {
                        parentRequestId: request.id || request.uniqueKey,
                        depth: currentDepth + 1,
                    },
                };
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        await Apify.utils.enqueueLinks(enqueueOptions);

        tools.logPerformance(request, 'handleLinks EXECUTION', start);
    }

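    /**
     * Builds a dataset payload from the request, response and the
     * `pageFunction` result (or error flag) and pushes it to the
     * default dataset.
     */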
    async _handleResult(request, response, pageFunctionResult, isError) {
        const start = process.hrtime();
        const payload = tools.createDatasetPayload(request, response, pageFunctionResult, isError);
        await Apify.pushData(payload);
        this.pagesOutputted++;
        tools.logPerformance(request, 'handleResult EXECUTION', start);
    }

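    /**
     * Waits until the injected Apify namespace becomes available on `window`
     * and throws a descriptive error when the injection times out.
     */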
    async _assertNamespace(page, namespace) {
        try {
            await page.waitFor(nmspc => !!window[nmspc], { timeout: this.input.pageLoadTimeoutSecs * 1000 }, namespace);
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Injection of environment into the browser context timed out. '
                    + 'If this persists even after retries, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }

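    /**
     * When the response is an XML document, waits (up to the page load
     * timeout) for the browser to finish parsing it, i.e. for
     * `document.readyState` to become `complete`.
     */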
    async _waitForLoadEventWhenXml(page, response) {
        // Response can sometimes be null.
        if (!response) return;

        const cTypeHeader = response.headers()['content-type'];
        try {
            const { type } = contentType.parse(cTypeHeader);
            if (!/^(text|application)\/xml$|\+xml$/.test(type)) return;
        } catch (err) {
            // Invalid type is not XML.
            return;
        }

        try {
            const timeout = this.input.pageLoadTimeoutSecs * 1000;
            await page.waitFor(() => document.readyState === 'complete', { timeout });
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Parsing of XML in the page timed out. If you\'re expecting a large XML file, '
                    + 'such as a site map, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }

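    /**
     * Exposes Node.js-side functionality to the browser context as handles:
     * `saveSnapshot`, `skipLinks`, the global store, `log`, selected `Apify`
     * methods and, when a request queue is used, its `addRequest` method.
     * @returns {Promise<Object>} A map of handle names to handles.
     */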
    async _injectBrowserHandles(page, pageContext) {
        const saveSnapshotP = browserTools.createBrowserHandle(page, () => browserTools.saveSnapshot({ page }));
        const skipLinksP = browserTools.createBrowserHandle(page, () => { pageContext.skipLinks = true; });
        const globalStoreP = browserTools.createBrowserHandlesForObject(
            page,
            this.globalStore,
            ['size', 'clear', 'delete', 'entries', 'get', 'has', 'keys', 'set', 'values'],
        );
        const logP = browserTools.createBrowserHandlesForObject(
            page,
            log,
            ['LEVELS', 'setLevel', 'getLevel', 'debug', 'info', 'warning', 'error', 'exception'],
        );
        const apifyP = browserTools.createBrowserHandlesForObject(page, Apify, ['getValue', 'setValue']);
        const requestQueueP = this.requestQueue
            ? browserTools.createBrowserHandlesForObject(page, this.requestQueue, ['addRequest'])
            : null;

        const [
            saveSnapshot,
            skipLinks,
            globalStore,
            logHandle,
            apify,
            requestQueue,
        ] = await Promise.all([saveSnapshotP, skipLinksP, globalStoreP, logP, apifyP, requestQueueP]);

        const handles = {
            saveSnapshot,
            skipLinks,
            globalStore,
            log: logHandle,
            apify,
        };
        if (requestQueue) handles.requestQueue = requestQueue;
        return handles;
    }
}

module.exports = CrawlerSetup;