
Web Scraper Experimental Debug
Experimental version of Apify Web Scraper with Chrome debugger integrated
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 65
Monthly users: 2
Runs succeeded: 94%
Last modified: 4 years ago
.eslintrc.json
{ "extends": "airbnb-base", "plugins": [ "import", "promise" ], "rules": { "indent": ["error", 4], "no-underscore-dangle": [2, { "allow": ["_id"], "allowAfterThis": true }], "no-use-before-define": 0, "no-param-reassign": 0, "consistent-return": 0, "array-callback-return": 0, "object-curly-newline": 0, "arrow-body-style": 0, "no-plusplus": 0, "strict": ["error", "global"], "max-len": ["error", 150], "no-undef": 0, "func-names": 0, "import/prefer-default-export": 0, "import/no-absolute-path": 0, "import/no-extraneous-dependencies": ["error", { "devDependencies": ["**/test/**/*.js"] }], "no-await-in-loop": 0, "no-restricted-syntax": ["error", "ForInStatement", "LabeledStatement", "WithStatement"] }}
.gitignore
.idea
apify_storage
node_modules
.nyc_output
coverage
src/page_function.js
.DS_Store
Dockerfile
FROM apify/actor-node-chrome-xvfb
COPY package.json package-lock.json ./
# Install default dependencies, print versions of everything
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list || true \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version
COPY . ./
INPUT_SCHEMA.json
{ "title": "Web Scraper Input", "type": "object", "description": "Use the following form to configure Web Scraper. The Start URLs and Page function are required and all other fields are optional. If you want to use Pseudo URLs, you also need to Use request queue. To learn more about the different options, click on their title or on the nearby question marks. For details about the Page function visit the actor's README in the ACTOR INFO tab. We also have a <a href=\"https://apify.com/docs/scraping/tutorial/introduction\" target=\"_blank\">step by step tutorial</a> to help you with the basics.\n\nIf you get stuck and it seems that you can no longer progress with the Web Scraper, try <a href=\"https://apify.com/docs/scraping/tutorial/puppeteer-scraper\" target=\"_blank\">Puppeteer Scraper</a>. It's a more powerful tool.", "schemaVersion": 1, "properties": { "startUrls": { "sectionCaption": "Basic configuration", "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources" }, "useRequestQueue": { "title": "Use request queue", "type": "boolean", "description": "Request queue enables recursive crawling and the use of Pseudo-URLs, Link selector and <code>context.enqueueRequest()</code>.", "default": true }, "pseudoUrls": { "title": "Pseudo-URLs", "type": "array", "description": "Pseudo-URLs to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Pseudo-URLs will cause the scraper to enqueue all links matched by the Link selector.", "editor": "pseudoUrls", "default": [], "prefill": [ { "purl": "https://apify.com[(/[\\w-]+)?]" } ] }, "linkSelector": { "title": "Link selector", "type": "string", "description": "CSS selector matching elements with 'href' attributes that should be enqueued. To enqueue urls from <code><div class=\"my-class\" href=...></code> tags, you would enter <strong>div.my-class</strong>. Leave empty to ignore all links.", "editor": "textfield", "prefill": "a" }, "keepUrlFragments": { "title": "Keep URL fragments", "type": "boolean", "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.", "default": false }, "pageFunction": { "title": "Page function", "type": "string", "description": "Function executed for each request", "prefill": "async function pageFunction(context) {\n // See README for context properties. If the syntax is unfamiliar see the link\n // https://javascript.info/destructuring-assignment#object-destructuring\n const { request, log, jQuery } = context;\n\n // To be able to use jQuery as $, one needs save it into a variable\n // and select the inject jQuery option. We've selected it for you.\n const $ = jQuery;\n const title = $('title').text();\n\n // This is yet another new feature of Javascript called template strings.\n // https://javascript.info/string#quotes\n log.info(`URL: ${request.url} TITLE: ${title}`);\n\n // To save data just return an object with the requested properties.\n return {\n url: request.url,\n title\n };\n}", "editor": "javascript" }, "injectJQuery": { "title": "jQuery", "type": "boolean", "description": "The jQuery library will be injected into each page. 
If the page already uses jQuery, conflicts may arise.", "default": true, "groupCaption": "Inject library", "groupDescription": "Settings that help inject 3rd party libraries into scraper pageFunction." }, "injectUnderscore": { "title": "Underscore", "type": "boolean", "description": "The Underscore.js library will be injected into each page. If the page already uses Underscore.js (or other libraries that attach to '_', such as Lodash), conflicts may arise.", "default": false }, "proxyConfiguration": { "sectionCaption": "Proxy and browser configuration", "title": "Proxy configuration", "type": "object", "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.", "prefill": { "useApifyProxy": false }, "default": {}, "editor": "proxy" }, "initialCookies": { "title": "Initial cookies", "type": "array", "description": "The provided cookies will be pre-set to all pages the scraper opens.", "default": [], "prefill": [], "editor": "json" }, "useChrome": { "title": "Use Chrome", "type": "boolean", "description": "The scraper will use a real Chrome browser instead of a Chromium masking as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.", "default": false, "groupCaption": "Browser masking", "groupDescription": "Settings that help mask as a real user and prevent scraper detection, but increase the chances of the scraper not working at all with some websites." }, "useStealth": { "title": "Use Stealth", "type": "boolean", "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.", "default": false }, "ignoreSslErrors": { "title": "Ignore SSL errors", "type": "boolean", "description": "Scraper will ignore SSL certificate errors.", "default": false, "groupCaption": "Security", "groupDescription": "Various options that help you disable browser security features such as HTTPS certificates and CORS." }, "ignoreCorsAndCsp": { "title": "Ignore CORS and CSP", "type": "boolean", "description": "Scraper will ignore CSP (content security policy) and CORS (cross origin resource sharing) settings of visited pages and requested domains. This enables you to freely use XHR/Fetch to make HTTP requests from the scraper.", "default": false }, "downloadMedia": { "sectionCaption": "Performance and limits", "title": "Download media", "type": "boolean", "description": "Scraper will download media such as images, fonts, videos and sounds. Disabling this may speed up the scrape, but certain websites could stop working correctly.", "default": true, "groupCaption": "Resources", "groupDescription": "Settings that help to disable downloading web page resources." }, "downloadCss": { "title": "Download CSS", "type": "boolean", "description": "Scraper will download CSS stylesheets. Disabling this may speed up the scrape, but certain websites could stop working correctly.", "default": true }, "maxRequestRetries": { "title": "Max request retries", "type": "integer", "description": "Maximum number of times the request for the page will be retried in case of an error. 
Setting it to 0 means that the request will be attempted once and will not be retried if it fails.", "minimum": 0, "default": 3, "unit": "retries" }, "maxPagesPerCrawl": { "title": "Max pages per run", "type": "integer", "description": "Maximum number of pages that the scraper will open. 0 means unlimited.", "minimum": 0, "default": 0, "unit": "pages" }, "maxResultsPerCrawl": { "title": "Max result records", "type": "integer", "description": "Maximum number of results that will be saved to dataset. The scraper will terminate afterwards. 0 means unlimited.", "minimum": 0, "default": 0, "unit": "results" }, "maxCrawlingDepth": { "title": "Max crawling depth", "type": "integer", "description": "Defines how many links away from the StartURLs will the scraper descend. 0 means unlimited.", "minimum": 0, "default": 0 }, "maxConcurrency": { "title": "Max concurrency", "type": "integer", "description": "Defines how many pages can be processed by the scraper in parallel. The scraper automatically increases and decreases concurrency based on available system resources. Use this option to set a hard limit.", "minimum": 1, "default": 50 }, "pageLoadTimeoutSecs": { "title": "Page load timeout", "type": "integer", "description": "Maximum time the scraper will allow a web page to load in seconds.", "minimum": 1, "default": 60, "maximum": 360, "unit": "secs" }, "pageFunctionTimeoutSecs": { "title": "Page function timeout", "type": "integer", "description": "Maximum time the scraper will wait for the page function to execute in seconds.", "minimum": 1, "default": 60, "maximum": 360, "unit": "secs" }, "waitUntil": { "title": "Navigation wait until", "type": "array", "description": "The scraper will wait until the selected events are triggered in the page before executing the page function. Available events are <code>domcontentloaded</code>, <code>load</code>, <code>networkidle2</code> and <code>networkidle0</code>. <a href=\"https://pptr.dev/#?product=Puppeteer&show=api-pagegotourl-options\" target=\"_blank\">See Puppeteer docs</a>.", "default": ["networkidle2"], "prefill": ["networkidle2"], "editor": "json" }, "debugLog": { "sectionCaption": "Advanced configuration", "title": "Debug log", "type": "boolean", "description": "Debug messages will be included in the log. Use <code>context.log.debug('message')</code> to log your own debug messages.", "default": false, "groupCaption": "Enable logs", "groupDescription": "Logs settings" }, "browserLog": { "title": "Browser log", "type": "boolean", "description": "Console messages from the Browser will be included in the log. This may result in the log being flooded by error messages, warnings and other messages of little value, especially with high concurrency.", "default": false }, "chromeDebugger": { "title": "Chrome debugger [experimental]", "type": "boolean", "description": "Experimental implementation of Chrome debugger. In this mode the scraper will run with single browser on concurrency 1. You can place <code>debugger;</code> into your page function to set up a debugger breakpoint.", "default": false }, "customData": { "title": "Custom data", "type": "object", "description": "This object will be available on pageFunction's context as customData.", "default": {}, "prefill": {}, "editor": "json" } }, "required": ["startUrls", "pageFunction"]}
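For reference, only the two fields listed under "required" above must be supplied; everything else falls back to its "default". A minimal actor input that satisfies this schema could look like the following (illustrative values, not shipped with the actor; note that pageFunction is passed as a string):

{
    "startUrls": [{ "url": "https://apify.com" }],
    "useRequestQueue": true,
    "pageFunction": "async function pageFunction(context) { return { url: context.request.url }; }"
}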
LICENSE
Apache License Version 2.0, January 2004 http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
apify.json
{ "name": "web-scraper-experimental-debug", "version": "0.1", "buildTag": "latest", "env": null}
main.js
const { runActor } = require('@apify/scraper-tools');
const CrawlerSetup = require('./src/crawler_setup');

runActor(CrawlerSetup);
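The entry point only hands the CrawlerSetup class to runActor from @apify/scraper-tools. A hedged sketch of roughly what that call amounts to, based on the CrawlerSetup API in src/crawler_setup.js below (this is an assumption for illustration, not the library's actual source):

// Hypothetical equivalent of runActor(CrawlerSetup); for illustration only.
const Apify = require('apify');

Apify.main(async () => {
    const input = await Apify.getValue('INPUT'); // actor input from the default key-value store
    const setup = new CrawlerSetup(input); // validates input, opens storages
    const crawler = await setup.createCrawler(); // resolves to an Apify.PuppeteerCrawler
    await crawler.run();
});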
package.json
{ "name": "actor-web-scraper", "version": "1.0.0", "description": "Crawl web pages using Apify, headless Chrome and Puppeteer", "main": "src/main.js", "dependencies": { "@apify/scraper-tools": "^0.1.2", "apify": "^0.16.0", "content-type": "^1.0.4", "http-proxy": "^1.18.0", "request-promise": "^4.2.4", "underscore": "^1.9.1", "url": "^0.11.0" }, "devDependencies": { "@types/puppeteer": "^1.19.0", "@types/underscore": "^1.9.2", "chai": "^4.2.0", "eslint": "^6.1.0", "eslint-config-airbnb": "^17.1.1", "eslint-config-airbnb-base": "^13.2.0", "eslint-plugin-import": "^2.18.2", "eslint-plugin-jsx-a11y": "^6.2.3", "eslint-plugin-promise": "^4.0.1", "eslint-plugin-react": "^7.14.3", "markdown-toc": "^1.2.0", "mocha": "^6.2.0", "nyc": "^14.1.1", "sinon": "^7.1.1" }, "scripts": { "start": "node main.js", "build-toc": "./node_modules/.bin/markdown-toc README.md -i", "test": "nyc --reporter=html --reporter=text mocha --timeout 60000 --recursive", "prepare": "npm run build-toc", "lint": "eslint src test" }, "repository": { "type": "git", "url": "https://github.com/apifytech/actor-scraper" }, "author": { "name": "Apify", "email": "support@apify.com", "url": "https://www.apify.com" }, "contributors": [ "Marek Trunkat <marek@apify.com>", "Ondra Urban <ondra@apify.com>" ], "license": "Apache-2.0", "homepage": "https://github.com/apifytech/actor-scraper"}
src/bundle.browser.js
/* istanbul ignore next */
/**
 * Command to be evaluated for Browser side code injection.
 * @param apifyNamespace
 */
module.exports = (apifyNamespace) => {
    (function (global, namespace) {
        if (typeof window[namespace] !== 'object') window[namespace] = {};
        /**
         * Takes a configuration object with function references
         * as an input and creates a dummy class that proxies
         * function invocations from Browser context to Node context.
         *
         * The configuration is to be provided by the
         * tools.createBrowserHandlesForObject function.
         */
        class NodeProxy {
            constructor(config) {
                if (!config || typeof config !== 'object') {
                    throw new Error('NodeProxy: Parameter config of type Object must be provided.');
                }

                Object.entries(config)
                    .forEach(([key, { value, type }]) => {
                        if (type === 'METHOD') {
                            this[key] = (...args) => global[value](...args);
                        } else if (type === 'GETTER') {
                            Object.defineProperty(this, key, {
                                get: () => global[value](),
                            });
                        } else if (type === 'VALUE') {
                            this[key] = value;
                        } else {
                            throw new Error(`Unsupported function type: ${type} for function: ${key}.`);
                        }
                    });
            }
        }

        /**
         * Exposed factory.
         * @param config
         * @return {NodeProxy}
         */
        global[namespace].createNodeProxy = config => new NodeProxy(config);

        const setup = Symbol('crawler-setup');
        const internalState = Symbol('request-internal-state');

        /**
         * Context represents everything that is available to the user
         * via Page Function. A class is used instead of a simple object
         * to avoid having to create new instances of functions with each
         * request.
         *
         * Some properties need to be accessible to the Context,
         * but should not be exposed to the user thus they are hidden
         * using a Symbol to prevent the user from easily accessing
         * and manipulating them.
         *
         * @param {Object} options
         * @param {Object} options.crawlerSetup
         * @param {Object} options.browserHandles
         * @param {Object} options.pageFunctionArguments
         */
        class Context {
            /* eslint-disable class-methods-use-this */
            constructor(options) {
                const {
                    crawlerSetup,
                    browserHandles,
                    pageFunctionArguments,
                } = options;

                const createProxy = global[namespace].createNodeProxy;

                // Private
                this[setup] = crawlerSetup;
                this[internalState] = {
                    browserHandles,
                    requestQueue: browserHandles.requestQueue ? createProxy(browserHandles.requestQueue) : null,
                    apify: createProxy(browserHandles.apify),
                };

                // Copies of Node objects
                this.input = JSON.parse(crawlerSetup.rawInput);
                this.env = Object.assign({}, crawlerSetup.env);
                this.customData = crawlerSetup.customData;
                this.response = pageFunctionArguments.response;
                this.request = pageFunctionArguments.request;
                // Functions are not converted so we need to add them this way
                // to not be enumerable and thus not polluting the object.
                Reflect.defineProperty(this.request, 'pushErrorMessage', {
                    value(errorOrMessage) {
                        // It's a simplified fake of the original function.
                        const msg = (errorOrMessage && errorOrMessage.message) || `${errorOrMessage}`;
                        this.errorMessages.push(msg);
                    },
                    enumerable: false,
                });

                // Proxied Node objects
                this.globalStore = createProxy(browserHandles.globalStore);
                this.log = createProxy(browserHandles.log);

                // Browser side libraries
                if (this[setup].injectJQuery) this.jQuery = global.jQuery.noConflict(true);
                if (this[setup].injectUnderscore) this.underscoreJs = global._.noConflict();

                // Bind this to allow destructuring off context in pageFunction.
                this.getValue = this.getValue.bind(this);
                this.setValue = this.setValue.bind(this);
                this.saveSnapshot = this.saveSnapshot.bind(this);
                this.skipLinks = this.skipLinks.bind(this);
                this.enqueueRequest = this.enqueueRequest.bind(this);
                this.waitFor = this.waitFor.bind(this);
            }

            async getValue(...args) {
                return this[internalState].apify.getValue(...args);
            }

            async setValue(...args) {
                return this[internalState].apify.setValue(...args);
            }

            async saveSnapshot() {
                const handle = this[internalState].browserHandles.saveSnapshot;
                return global[handle]();
            }

            async skipLinks() {
                const handle = this[internalState].browserHandles.skipLinks;
                return global[handle]();
            }

            async enqueueRequest(requestOpts = {}, options = {}) {
                if (!this[setup].useRequestQueue) {
                    throw new Error('Input parameter "useRequestQueue" must be set to true to be able to enqueue new requests.');
                }

                const defaultRequestOpts = {
                    useExtendedUniqueKey: true,
                    keepUrlFragment: this.input.keepUrlFragments,
                };

                const newRequest = { ...defaultRequestOpts, ...requestOpts };

                const metaKey = this[setup].META_KEY;
                const defaultUserData = {
                    [metaKey]: {
                        parentRequestId: this.request.id || this.request.uniqueKey,
                        depth: this.request.userData[metaKey].depth + 1,
                    },
                };

                newRequest.userData = { ...defaultUserData, ...requestOpts.userData };

                return this[internalState].requestQueue.addRequest(newRequest, options);
            }

            async waitFor(selectorOrNumberOrFunction, options = {}) {
                if (!options || typeof options !== 'object') throw new Error('Parameter options must be an Object');
                const type = typeof selectorOrNumberOrFunction;
                if (type === 'string') return this._waitForSelector(selectorOrNumberOrFunction, options);
                if (type === 'number') return this._waitForMillis(selectorOrNumberOrFunction);
                if (type === 'function') return this._waitForFunction(selectorOrNumberOrFunction, options);
                throw new Error('Parameter selectorOrNumberOrFunction must be one of the said types.');
            }

            async _waitForSelector(selector, options = {}) {
                try {
                    await this._poll(() => {
                        return !!global.document.querySelector(selector);
                    }, options);
                } catch (err) {
                    if (/timeout of \d+ms exceeded/.test(err.message)) {
                        throw new Error(`Timeout Error: waiting for selector failed: ${err.message}`);
                    }
                    throw err;
                }
            }

            async _waitForMillis(millis) {
                return new Promise(res => setTimeout(res, millis));
            }

            async _waitForFunction(predicate, options = {}) {
                try {
                    await this._poll(predicate, options);
                } catch (err) {
                    if (/timeout of \d+ms exceeded/.test(err.message)) {
                        throw new Error(`Timeout Error: waiting for function failed: ${err.message}`);
                    }
                    throw err;
                }
            }

            async _poll(predicate, options = {}) {
                const {
                    pollingIntervalMillis = 50,
                    timeoutMillis = 20000,
                } = options;
                return new Promise((resolve, reject) => {
                    const handler = () => {
                        return predicate() ? resolve() : setTimeout(handler);
                    };
                    const pollTimeout = setTimeout(handler, pollingIntervalMillis);
                    setTimeout(() => {
                        clearTimeout(pollTimeout);
                        return reject(new Error(`timeout of ${timeoutMillis}ms exceeded.`));
                    }, timeoutMillis);
                });
            }
        }

        /**
         * Exposed factory.
         * @param {Object} options
         * @returns {Context}
         */
        global[namespace].createContext = (options) => {
            return new Context(options);
        };
    }(window, apifyNamespace));
};
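The config object consumed by NodeProxy maps property names to the names of global functions that the Node side exposes on the page (via the browserTools handle helpers in @apify/scraper-tools). A minimal sketch of the expected shape, using the 'Apify' namespace from the tests below; the handle names are made up for illustration:

// Runs in the browser context. 'handle_abc123' and 'handle_def456' are
// hypothetical global functions created by the Node-side handle plumbing.
const proxy = window.Apify.createNodeProxy({
    addRequest: { type: 'METHOD', value: 'handle_abc123' }, // proxy.addRequest(...) calls window.handle_abc123(...)
    size: { type: 'GETTER', value: 'handle_def456' },       // reading proxy.size invokes window.handle_def456()
    LEVELS: { type: 'VALUE', value: { DEBUG: 5 } },         // plain value copied onto the proxy
});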
src/crawler_setup.js
const Apify = require('apify');
const _ = require('underscore');
const httpProxy = require('http-proxy');
const url = require('url');
const contentType = require('content-type');
const {
    tools,
    browserTools,
    constants: { META_KEY, DEFAULT_VIEWPORT, DEVTOOLS_TIMEOUT_SECS },
} = require('@apify/scraper-tools');
const rp = require('request-promise');

const GlobalStore = require('./global_store');
const createBundle = require('./bundle.browser');
const SCHEMA = require('../INPUT_SCHEMA');

const CHROME_DEBUGGER_PORT = 4322;

const { utils: { log, puppeteer } } = Apify;

/**
 * Replicates the INPUT_SCHEMA with JavaScript types for quick reference
 * and IDE type check integration.
 *
 * @typedef {Object} Input
 * @property {Object[]} startUrls
 * @property {boolean} useRequestQueue
 * @property {Object[]} pseudoUrls
 * @property {string} linkSelector
 * @property {boolean} keepUrlFragments
 * @property {string} pageFunction
 * @property {Object} proxyConfiguration
 * @property {boolean} debugLog
 * @property {boolean} browserLog
 * @property {boolean} injectJQuery
 * @property {boolean} injectUnderscore
 * @property {boolean} downloadMedia
 * @property {boolean} downloadCss
 * @property {number} maxRequestRetries
 * @property {number} maxPagesPerCrawl
 * @property {number} maxResultsPerCrawl
 * @property {number} maxCrawlingDepth
 * @property {number} maxConcurrency
 * @property {number} pageLoadTimeoutSecs
 * @property {number} pageFunctionTimeoutSecs
 * @property {Object} customData
 * @property {Array} initialCookies
 * @property {Array} waitUntil
 * @property {boolean} useChrome
 * @property {boolean} useStealth
 * @property {boolean} ignoreCorsAndCsp
 * @property {boolean} ignoreSslErrors
 */

/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a pageFunction invocation.
 */
class CrawlerSetup {
    /* eslint-disable class-methods-use-this */
    constructor(input) {
        this.name = 'Web Scraper';
        // Set log level early to prevent missed messages.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep this as string to be immutable.
        this.rawInput = JSON.stringify(input);

        // Attempt to load page function from disk if not present on input.
        tools.maybeLoadPageFunctionFromDisk(input, __dirname);

        // Validate INPUT if not running on Apify Cloud Platform.
        if (!Apify.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        /**
         * @type {Input}
         */
        this.input = input;
        this.env = Apify.getEnv();

        // Validations
        if (this.input.pseudoUrls.length && !this.input.useRequestQueue) {
            throw new Error('Cannot enqueue links using Pseudo URLs without using a Request Queue. '
                + 'Either select the "Use Request Queue" option to enable Request Queue or '
                + 'remove your Pseudo URLs.');
        }
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) throw new Error('The pseudoUrls Array must only contain Objects.');
            if (purl.userData && !tools.isPlainObject(purl.userData)) throw new Error('The userData property of a pseudoUrl must be an Object.');
        });
        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) throw new Error('The initialCookies Array must only contain Objects.');
        });
        this.input.waitUntil.forEach((event) => {
            if (!/^(domcontentloaded|load|networkidle2|networkidle0)$/.test(event)) {
                throw new Error('Navigation wait until events must be valid. See tooltip.');
            }
        });
        tools.evalFunctionOrThrow(this.input.pageFunction);

        // Used to store page specific data.
        this.pageContexts = new WeakMap();

        // Used to store data that persist navigations
        this.globalStore = new GlobalStore();

        // Excluded resources
        this.blockedUrlPatterns = [];
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [...this.blockedUrlPatterns,
                '.jpg', '.jpeg', '.png', '.svg', '.gif', '.webp', '.webm', '.ico', '.woff', '.eot',
            ];
        }
        if (!this.input.downloadCss) this.blockedUrlPatterns.push('.css');

        // Start Chromium with Debugger any time the page function includes the keyword.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Initialize async operations.
        this.crawler = null;
        this.requestList = null;
        this.requestQueue = null;
        this.dataset = null;
        this.keyValueStore = null;
        this.proxy = null;
        this.initPromise = this._initializeAsync();
    }

    async _initializeAsync() {
        // RequestList
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });
        this.requestList = await Apify.openRequestList('WEB_SCRAPER', startUrls);

        // RequestQueue if selected
        if (this.input.useRequestQueue) this.requestQueue = await Apify.openRequestQueue();

        // Dataset
        this.dataset = await Apify.openDataset();
        const { itemsCount } = await this.dataset.getInfo();
        this.pagesOutputted = itemsCount || 0;

        // KeyValueStore
        this.keyValueStore = await Apify.openKeyValueStore();
    }

    /**
     * Resolves to a `PuppeteerCrawler` instance.
     * @returns {Promise<PuppeteerCrawler>}
     */
    async createCrawler() {
        await this.initPromise;

        const args = [];
        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');

        const options = {
            handlePageFunction: this._handlePageFunction.bind(this),
            requestList: this.requestList,
            requestQueue: this.requestQueue,
            handlePageTimeoutSecs: this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageFunctionTimeoutSecs,
            gotoFunction: this._gotoFunction.bind(this),
            handleFailedRequestFunction: this._handleFailedRequestFunction.bind(this),
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyUrls: this.input.proxyConfiguration.proxyUrls,
            // launchPuppeteerFunction: use default,
            puppeteerPoolOptions: {
                useLiveView: true,
                recycleDiskCache: true,
            },
            launchPuppeteerOptions: {
                ...(_.omit(this.input.proxyConfiguration, 'proxyUrls')),
                ignoreHTTPSErrors: this.input.ignoreSslErrors,
                defaultViewport: DEFAULT_VIEWPORT,
                devtools: this.devtools,
                headless: false,
                useChrome: this.input.useChrome,
                stealth: this.input.useStealth,
                args,
            },
        };

        if (this.input.chromeDebugger) {
            log.info(`Starting scraper with Chrome debugger attached:
    - Scraper will run with concurrency 1.
    - The whole run will be performed with a single browser (= 1 proxy IP).
    - You can place "debugger;" in your page function to set a breakpoint.`);

            options.puppeteerPoolOptions.useLiveView = false;
            options.puppeteerPoolOptions.retireInstanceAfterRequestCount = 1000000;
            options.maxConcurrency = 1;
            options.launchPuppeteerFunction = async (launchPuppeteerOptions) => {
                launchPuppeteerOptions.args = [`--remote-debugging-port=${CHROME_DEBUGGER_PORT}`, '--user-data-dir=remote-profile'];
                launchPuppeteerOptions.defaultViewport = { width: 720, height: 1024 };

                const browser = await Apify.launchPuppeteer(launchPuppeteerOptions);
                const containerHost = url.parse(process.env.APIFY_CONTAINER_URL).host;

                // Start HTTP proxy from live view port to Chrome debugger port.
                if (this.proxy) this.proxy.close();
                this.proxy = httpProxy.createServer({
                    target: {
                        host: 'localhost',
                        port: CHROME_DEBUGGER_PORT,
                    },
                    ws: true,
                }).listen(4321);

                // Fetches debugger URL of page we are scraping.
                const debuggerUrlPromise = (async () => {
                    while (true) {
                        const pageList = await rp({ url: `http://localhost:${CHROME_DEBUGGER_PORT}/json/list`, json: true });

                        // All the internal debugger pages start with something like "devtools://"...
                        const debuggerPage = pageList.filter(page => page.url.startsWith('http')).pop();
                        if (debuggerPage) return debuggerPage;

                        await Apify.utils.sleep(1000);
                    }
                })();

                // Intercept requests to HTTP proxy.
                this.proxy.on('proxyReq', async (proxyReq, req, res) => {
                    // We need Chrome to think that it's on localhost otherwise it throws an error...
                    proxyReq.setHeader('Host', 'localhost');

                    // URL of debugger of a page we are scraping.
                    let targetUrl = req.url;
                    if (targetUrl === '/') {
                        proxyReq.socket.pause(); // We need to pause it to be able to perform async operations.
                        targetUrl = (await debuggerUrlPromise).devtoolsFrontendUrl; // This one includes ws=localhost so the redirect below will be performed.
                        targetUrl = targetUrl.replace(`:${CHROME_DEBUGGER_PORT}`, '');
                        proxyReq.socket.resume();
                    }

                    // ...but then Chrome thinks it's running on localhost, so we need to change the websocket
                    // host in the URL to be correct.
                    if (targetUrl.includes('ws=localhost')) {
                        const newUrl = targetUrl.replace('ws=localhost', `wss=${containerHost}`);
                        res.writeHead(301, { Location: newUrl });
                        res.end();
                    }
                });

                return browser;
            };
        }

        this.crawler = new Apify.PuppeteerCrawler(options);

        if (this.input.chromeDebugger) {
            // this.crawler.puppeteerPool is only set up inside crawler.run(),
            // so poll until it exists and then enable page reuse.
            const interval = setInterval(() => {
                if (this.crawler.puppeteerPool) {
                    this.crawler.puppeteerPool.reusePages = true;
                    clearInterval(interval);
                }
            }, 10);
        }

        return this.crawler;
    }

    async _gotoFunction({ request, page }) {
        const start = process.hrtime();

        // Create a new page context with a new random key for Apify namespace.
        const pageContext = {
            apifyNamespace: await tools.createRandomHash(),
            skipLinks: false,
        };
        this.pageContexts.set(page, pageContext);

        // Attach a console listener to get all logs as soon as possible.
        if (this.input.browserLog) browserTools.dumpConsole(page);

        // Prevent download of stylesheets and media, unless selected otherwise
        if (this.blockedUrlPatterns.length) {
            await puppeteer.blockRequests(page, {
                urlPatterns: this.blockedUrlPatterns,
            });
        }

        // Add initial cookies, if any.
        if (this.input.initialCookies.length) await page.setCookie(...this.input.initialCookies);

        // Disable content security policy.
        if (this.input.ignoreCorsAndCsp) await page.setBypassCSP(true);

        tools.logPerformance(request, 'gotoFunction INIT', start);
        const handleStart = process.hrtime();
        pageContext.browserHandles = await this._injectBrowserHandles(page, pageContext);
        tools.logPerformance(request, 'gotoFunction INJECTION HANDLES', handleStart);

        const evalStart = process.hrtime();
        await Promise.all([
            page.evaluateOnNewDocument(createBundle, pageContext.apifyNamespace),
            page.evaluateOnNewDocument(browserTools.wrapPageFunction(this.input.pageFunction, pageContext.apifyNamespace)),
        ]);
        tools.logPerformance(request, 'gotoFunction INJECTION EVAL', evalStart);

        // Invoke navigation.
        const navStart = process.hrtime();
        const response = await puppeteer.gotoExtended(page, request, {
            timeout: (this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageLoadTimeoutSecs) * 1000,
            waitUntil: this.input.waitUntil,
        });
        await this._waitForLoadEventWhenXml(page, response);
        tools.logPerformance(request, 'gotoFunction NAVIGATION', navStart);

        const delayStart = process.hrtime();
        await this._assertNamespace(page, pageContext.apifyNamespace);

        // Inject selected libraries
        if (this.input.injectJQuery) await puppeteer.injectJQuery(page);
        if (this.input.injectUnderscore) await puppeteer.injectUnderscore(page);

        tools.logPerformance(request, 'gotoFunction INJECTION DELAY', delayStart);
        tools.logPerformance(request, 'gotoFunction EXECUTION', start);
        return response;
    }

    _handleFailedRequestFunction({ request }) {
        const lastError = request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
        return this._handleResult(request, {}, null, true);
    }

    /**
     * First of all, it initializes the state that is exposed to the user via
     * `pageFunction` context.
     *
     * Then it invokes the user provided `pageFunction` with the prescribed context
     * and saves its return value.
     *
     * Finally, it makes decisions based on the current state and post-processes
     * the data returned from the `pageFunction`.
     * @param {Object} environment
     * @returns {Function}
     */
    async _handlePageFunction({ request, response, page, autoscaledPool }) {
        const start = process.hrtime();

        const pageContext = this.pageContexts.get(page);

        /**
         * PRE-PROCESSING
         */
        // Make sure that an object containing internal metadata
        // is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(autoscaledPool);
        if (aborted) return;

        // Setup Context and pass the configuration down to Browser.
        const contextOptions = {
            crawlerSetup: Object.assign(
                _.pick(this, ['rawInput', 'env']),
                _.pick(this.input, ['customData', 'useRequestQueue', 'injectJQuery', 'injectUnderscore']),
                { META_KEY },
            ),
            browserHandles: pageContext.browserHandles,
            pageFunctionArguments: {
                request,
                response: {
                    status: response && response.status(),
                    headers: response && response.headers(),
                },
            },
        };

        /**
         * USER FUNCTION EXECUTION
         */
        tools.logPerformance(request, 'handlePageFunction PREPROCESSING', start);
        const startUserFn = process.hrtime();

        const namespace = pageContext.apifyNamespace;
        const output = await page.evaluate(async (ctxOpts, namespc) => {
            /* eslint-disable no-shadow */
            const context = window[namespc].createContext(ctxOpts);
            const output = {};
            try {
                output.pageFunctionResult = await window[namespc].pageFunction(context);
            } catch (err) {
                output.pageFunctionError = Object.getOwnPropertyNames(err)
                    .reduce((memo, name) => {
                        memo[name] = err[name];
                        return memo;
                    }, {});
            }
            // This needs to be added after pageFunction has run.
            output.requestFromBrowser = context.request;

            /**
             * Since Dates cannot travel back to Node and Puppeteer does not use .toJSON
             * to stringify, they come back as empty objects. We could use JSON.stringify
             * ourselves, but that exposes us to overridden .toJSON in the target websites.
             * This hack is not ideal, but it avoids both problems.
             */
            function replaceAllDatesInObjectWithISOStrings(obj) {
                for (const [key, value] of Object.entries(obj)) {
                    if (value instanceof Date && typeof value.toISOString === 'function') {
                        obj[key] = value.toISOString();
                    } else if (value && typeof value === 'object') {
                        replaceAllDatesInObjectWithISOStrings(value);
                    }
                }
                return obj;
            }

            return replaceAllDatesInObjectWithISOStrings(output);
        }, contextOptions, namespace);

        tools.logPerformance(request, 'handlePageFunction USER FUNCTION', startUserFn);
        const finishUserFn = process.hrtime();

        /**
         * POST-PROCESSING
         */
        const { pageFunctionResult, requestFromBrowser, pageFunctionError } = output;
        // Merge requestFromBrowser into request to preserve modifications that
        // may have been made in browser context.
        Object.assign(request, requestFromBrowser);

        // Throw error from pageFunction, if any.
        if (pageFunctionError) throw tools.createError(pageFunctionError);

        // Enqueue more links if a link selector is available,
        // unless the user invoked the `skipLinks()` context function
        // or maxCrawlingDepth would be exceeded.
        if (!pageContext.skipLinks) {
            await this._handleLinks(page, request);
        }

        // Save the `pageFunction`s result (or just metadata) to the default dataset.
        await this._handleResult(request, response, pageFunctionResult);

        tools.logPerformance(request, 'handlePageFunction POSTPROCESSING', finishUserFn);
        tools.logPerformance(request, 'handlePageFunction EXECUTION', start);
    }

    async _handleMaxResultsPerCrawl(autoscaledPool) {
        if (!this.input.maxResultsPerCrawl || this.pagesOutputted < this.input.maxResultsPerCrawl) return false;
        log.info(`User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`);
        await autoscaledPool.abort();
        return true;
    }

    async _handleLinks(page, request) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const start = process.hrtime();

        const currentDepth = request.userData[META_KEY].depth;
        const hasReachedMaxDepth = this.input.maxCrawlingDepth && currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(`Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`);
            return;
        }

        const enqueueOptions = {
            page,
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            requestQueue: this.requestQueue,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData = {
                    [META_KEY]: {
                        parentRequestId: request.id || request.uniqueKey,
                        depth: currentDepth + 1,
                    },
                };
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        await Apify.utils.enqueueLinks(enqueueOptions);

        tools.logPerformance(request, 'handleLinks EXECUTION', start);
    }

    async _handleResult(request, response, pageFunctionResult, isError) {
        const start = process.hrtime();
        const payload = tools.createDatasetPayload(request, response, pageFunctionResult, isError);
        await Apify.pushData(payload);
        this.pagesOutputted++;
        tools.logPerformance(request, 'handleResult EXECUTION', start);
    }

    async _assertNamespace(page, namespace) {
        try {
            await page.waitFor(nmspc => !!window[nmspc], { timeout: this.input.pageLoadTimeoutSecs * 1000 }, namespace);
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Injection of environment into the browser context timed out. '
                    + 'If this persists even after retries, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }

    async _waitForLoadEventWhenXml(page, response) {
        // Response can sometimes be null.
        if (!response) return;

        const cTypeHeader = response.headers()['content-type'];
        try {
            const { type } = contentType.parse(cTypeHeader);
            if (!/^(text|application)\/xml$|\+xml$/.test(type)) return;
        } catch (err) {
            // Invalid type is not XML.
            return;
        }

        try {
            const timeout = this.input.pageLoadTimeoutSecs * 1000;
            await page.waitFor(() => document.readyState === 'complete', { timeout });
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Parsing of XML in the page timed out. If you\'re expecting a large XML file, '
                    + 'such as a site map, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }

    async _injectBrowserHandles(page, pageContext) {
        const saveSnapshotP = browserTools.createBrowserHandle(page, () => browserTools.saveSnapshot({ page }));
        const skipLinksP = browserTools.createBrowserHandle(page, () => { pageContext.skipLinks = true; });
        const globalStoreP = browserTools.createBrowserHandlesForObject(
            page,
            this.globalStore,
            ['size', 'clear', 'delete', 'entries', 'get', 'has', 'keys', 'set', 'values'],
        );
        const logP = browserTools.createBrowserHandlesForObject(
            page,
            log,
            ['LEVELS', 'setLevel', 'getLevel', 'debug', 'info', 'warning', 'error', 'exception'],
        );
        const apifyP = browserTools.createBrowserHandlesForObject(page, Apify, ['getValue', 'setValue']);
        const requestQueueP = this.requestQueue
            ? browserTools.createBrowserHandlesForObject(page, this.requestQueue, ['addRequest'])
            : null;

        const [
            saveSnapshot,
            skipLinks,
            globalStore,
            logHandle,
            apify,
            requestQueue,
        ] = await Promise.all([saveSnapshotP, skipLinksP, globalStoreP, logP, apifyP, requestQueueP]);

        const handles = {
            saveSnapshot,
            skipLinks,
            globalStore,
            log: logHandle,
            apify,
        };
        if (requestQueue) handles.requestQueue = requestQueue;
        return handles;
    }
}

module.exports = CrawlerSetup;
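Putting the pieces together, a pageFunction run by this setup can use the proxied context helpers defined in bundle.browser.js. A hedged example (the selector and follow-up URL are placeholders; enqueueRequest requires useRequestQueue: true and jQuery requires the Inject jQuery option):

async function pageFunction(context) {
    const { request, log, jQuery: $ } = context;

    // Wait for a selector rendered by client-side code (default timeout 20s).
    await context.waitFor('.loaded');

    // Persist a counter across navigations via the proxied GlobalStore.
    const count = (await context.globalStore.get('count')) || 0;
    await context.globalStore.set('count', count + 1);

    // Enqueue a follow-up page into the request queue.
    await context.enqueueRequest({ url: 'https://apify.com/store' });

    log.info(`Page ${request.url} processed.`);
    return { url: request.url, title: $('title').text() };
}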
src/global_store.js
/**
 * GlobalStore is a trivial storage that resembles a Map to be used from Browser contexts
 * to retain data through page navigations and browser instances. It limits Map's functionality
 * because it's currently impossible for functions and object references to cross the Node-Browser threshold.
 */
class GlobalStore extends Map {
    /* eslint-disable class-methods-use-this */
    get(key) {
        if (typeof key !== 'string') throw new Error('GlobalStore#get parameter "key" must be a string.');
        return super.get(key);
    }

    set(key, value) {
        if (typeof key !== 'string') throw new Error('GlobalStore#set parameter "key" must be a string.');
        return super.set(key, value);
    }

    forEach() {
        throw new Error('GlobalStore#forEach function is not available due to underlying technology limitations.');
    }

    values() {
        return Array.from(super.values());
    }

    keys() {
        return Array.from(super.keys());
    }

    entries() {
        return Array.from(super.entries());
    }

    get size() {
        return super.size;
    }
}

module.exports = GlobalStore;
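Because only serializable data crosses the Node-Browser boundary, GlobalStore accepts string keys only and returns arrays instead of iterators. A quick standalone illustration of that contract, run on the Node side:

const GlobalStore = require('./src/global_store');

const store = new GlobalStore();
store.set('results', 1);
console.log(store.get('results')); // 1
console.log(store.entries()); // [['results', 1]] - a plain Array, not an iterator
store.set(42, 'nope'); // throws: parameter "key" must be a string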
test/bundle.browser.js
const { expect } = require('chai');
const Apify = require('apify');
const createBundle = require('../src/bundle.browser');

const NAMESPACE = 'Apify';

describe('Bundle', () => {
    let browser;
    let page;
    before(async () => {
        browser = await Apify.launchPuppeteer({ headless: true });
    });
    after(async () => {
        await browser.close();
    });
    beforeEach(async () => {
        page = await browser.newPage();
        await page.evaluateOnNewDocument(createBundle, NAMESPACE);
    });
    afterEach(async () => {
        await page.close();
    });
    describe('Context', () => {
        const CONTEXT_OPTIONS = {
            crawlerSetup: {
                rawInput: '{}',
            },
            browserHandles: {
                apify: {},
                globalStore: {},
                log: {},
            },
            pageFunctionArguments: {
                request: {},
            },
        };
        beforeEach(async () => {
            await page.goto('about:chrome');
            await page.waitFor(namespace => !!window[namespace], {}, NAMESPACE);
            await page.evaluate((namespace, contextOptions) => {
                window.contextInstance = window[namespace].createContext(contextOptions);
            }, NAMESPACE, CONTEXT_OPTIONS);
        });
        describe('waitFor', () => {
            it('should work with a number', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    await ctx.waitFor(10);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a selector', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    setTimeout(() => {
                        const el = document.createElement('div');
                        el.id = 'very-unique-id';
                        document.body.appendChild(el);
                    }, 10);
                    await ctx.waitFor('#very-unique-id');
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a function', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    let done = false;
                    const start = Date.now();
                    setTimeout(() => {
                        done = true;
                    }, 10);
                    await ctx.waitFor(() => done);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
        });
    });
});