Web Scraper Experimental Debug
mtrunkat/web-scraper-experimental-dbgr
An experimental version of Apify Web Scraper with an integrated Chrome debugger.
.eslintrc.json
{
    "extends": "airbnb-base",
    "plugins": [
        "import",
        "promise"
    ],
    "rules": {
        "indent": ["error", 4],
        "no-underscore-dangle": [2, {
            "allow": ["_id"],
            "allowAfterThis": true
        }],
        "no-use-before-define": 0,
        "no-param-reassign": 0,
        "consistent-return": 0,
        "array-callback-return": 0,
        "object-curly-newline": 0,
        "arrow-body-style": 0,
        "no-plusplus": 0,
        "strict": ["error", "global"],
        "max-len": ["error", 150],
        "no-undef": 0,
        "func-names": 0,
        "import/prefer-default-export": 0,
        "import/no-absolute-path": 0,
        "import/no-extraneous-dependencies": ["error", { "devDependencies": ["**/test/**/*.js"] }],
        "no-await-in-loop": 0,
        "no-restricted-syntax": ["error", "ForInStatement", "LabeledStatement", "WithStatement"]
    }
}
.gitignore
.idea
apify_storage
node_modules
.nyc_output
coverage
src/page_function.js
.DS_Store
Dockerfile
FROM apify/actor-node-chrome-xvfb

COPY package.json package-lock.json ./

# Install default dependencies, print versions of everything
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list || true \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

COPY . ./
INPUT_SCHEMA.json
{
    "title": "Web Scraper Input",
    "type": "object",
    "description": "Use the following form to configure Web Scraper. The Start URLs and Page function are required and all other fields are optional. If you want to use Pseudo URLs, you also need to Use request queue. To learn more about the different options, click on their title or on the nearby question marks. For details about the Page function visit the actor's README in the ACTOR INFO tab. We also have a <a href=\"https://apify.com/docs/scraping/tutorial/introduction\" target=\"_blank\">step by step tutorial</a> to help you with the basics.\n\nIf you get stuck and it seems that you can no longer progress with the Web Scraper, try <a href=\"https://apify.com/docs/scraping/tutorial/puppeteer-scraper\" target=\"_blank\">Puppeteer Scraper</a>. It's a more powerful tool.",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "sectionCaption": "Basic configuration",
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "useRequestQueue": {
            "title": "Use request queue",
            "type": "boolean",
            "description": "Request queue enables recursive crawling and the use of Pseudo-URLs, Link selector and <code>context.enqueueRequest()</code>.",
            "default": true
        },
        "pseudoUrls": {
            "title": "Pseudo-URLs",
            "type": "array",
            "description": "Pseudo-URLs to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Pseudo-URLs will cause the scraper to enqueue all links matched by the Link selector.",
            "editor": "pseudoUrls",
            "default": [],
            "prefill": [
                {
                    "purl": "https://apify.com[(/[\\w-]+)?]"
                }
            ]
        },
        "linkSelector": {
            "title": "Link selector",
            "type": "string",
            "description": "CSS selector matching elements with 'href' attributes that should be enqueued. To enqueue URLs from <code><div class=\"my-class\" href=...></code> tags, you would enter <strong>div.my-class</strong>. Leave empty to ignore all links.",
            "editor": "textfield",
            "prefill": "a"
        },
        "keepUrlFragments": {
            "title": "Keep URL fragments",
            "type": "boolean",
            "description": "URL fragments (the part of a URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
            "default": false
        },
        "pageFunction": {
            "title": "Page function",
            "type": "string",
            "description": "Function executed for each request",
            "prefill": "async function pageFunction(context) {\n    // See README for context properties. If the syntax is unfamiliar, see the link\n    // https://javascript.info/destructuring-assignment#object-destructuring\n    const { request, log, jQuery } = context;\n\n    // To be able to use jQuery as $, one needs to save it into a variable\n    // and select the Inject jQuery option. We've selected it for you.\n    const $ = jQuery;\n    const title = $('title').text();\n\n    // This is yet another new feature of JavaScript called template strings.\n    // https://javascript.info/string#quotes\n    log.info(`URL: ${request.url} TITLE: ${title}`);\n\n    // To save data just return an object with the requested properties.\n    return {\n        url: request.url,\n        title\n    };\n}",
            "editor": "javascript"
        },
        "injectJQuery": {
            "title": "jQuery",
            "type": "boolean",
            "description": "The jQuery library will be injected into each page. If the page already uses jQuery, conflicts may arise.",
            "default": true,
            "groupCaption": "Inject library",
            "groupDescription": "Settings that help inject 3rd party libraries into the scraper's pageFunction."
        },
        "injectUnderscore": {
            "title": "Underscore",
            "type": "boolean",
            "description": "The Underscore.js library will be injected into each page. If the page already uses Underscore.js (or other libraries that attach to '_', such as Lodash), conflicts may arise.",
            "default": false
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and browser configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
            "prefill": { "useApifyProxy": false },
            "default": {},
            "editor": "proxy"
        },
        "initialCookies": {
            "title": "Initial cookies",
            "type": "array",
            "description": "The provided cookies will be pre-set to all pages the scraper opens.",
            "default": [],
            "prefill": [],
            "editor": "json"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "The scraper will use a real Chrome browser instead of Chromium masquerading as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.",
            "default": false,
            "groupCaption": "Browser masking",
            "groupDescription": "Settings that help mask as a real user and prevent scraper detection, but increase the chances of the scraper not working at all with some websites."
        },
        "useStealth": {
            "title": "Use Stealth",
            "type": "boolean",
            "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.",
            "default": false
        },
        "ignoreSslErrors": {
            "title": "Ignore SSL errors",
            "type": "boolean",
            "description": "Scraper will ignore SSL certificate errors.",
            "default": false,
            "groupCaption": "Security",
            "groupDescription": "Various options that help you disable browser security features such as HTTPS certificates and CORS."
        },
        "ignoreCorsAndCsp": {
            "title": "Ignore CORS and CSP",
            "type": "boolean",
            "description": "Scraper will ignore CSP (content security policy) and CORS (cross origin resource sharing) settings of visited pages and requested domains. This enables you to freely use XHR/Fetch to make HTTP requests from the scraper.",
            "default": false
        },
        "downloadMedia": {
            "sectionCaption": "Performance and limits",
            "title": "Download media",
            "type": "boolean",
            "description": "Scraper will download media such as images, fonts, videos and sounds. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
            "default": true,
            "groupCaption": "Resources",
            "groupDescription": "Settings that help to disable downloading web page resources."
        },
        "downloadCss": {
            "title": "Download CSS",
            "type": "boolean",
            "description": "Scraper will download CSS stylesheets. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
            "default": true
        },
        "maxRequestRetries": {
            "title": "Max request retries",
            "type": "integer",
            "description": "Maximum number of times the request for the page will be retried in case of an error. Setting it to 0 means that the request will be attempted once and will not be retried if it fails.",
            "minimum": 0,
            "default": 3,
            "unit": "retries"
        },
        "maxPagesPerCrawl": {
            "title": "Max pages per run",
            "type": "integer",
            "description": "Maximum number of pages that the scraper will open. 0 means unlimited.",
            "minimum": 0,
            "default": 0,
            "unit": "pages"
        },
        "maxResultsPerCrawl": {
            "title": "Max result records",
            "type": "integer",
            "description": "Maximum number of results that will be saved to the dataset. The scraper will terminate afterwards. 0 means unlimited.",
            "minimum": 0,
            "default": 0,
            "unit": "results"
        },
        "maxCrawlingDepth": {
            "title": "Max crawling depth",
            "type": "integer",
            "description": "Defines how many links away from the Start URLs the scraper will descend. 0 means unlimited.",
            "minimum": 0,
            "default": 0
        },
        "maxConcurrency": {
            "title": "Max concurrency",
            "type": "integer",
            "description": "Defines how many pages can be processed by the scraper in parallel. The scraper automatically increases and decreases concurrency based on available system resources. Use this option to set a hard limit.",
            "minimum": 1,
            "default": 50
        },
        "pageLoadTimeoutSecs": {
            "title": "Page load timeout",
            "type": "integer",
            "description": "Maximum time in seconds the scraper will allow a web page to load.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "pageFunctionTimeoutSecs": {
            "title": "Page function timeout",
            "type": "integer",
            "description": "Maximum time in seconds the scraper will wait for the page function to execute.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "waitUntil": {
            "title": "Navigation wait until",
            "type": "array",
            "description": "The scraper will wait until the selected events are triggered in the page before executing the page function. Available events are <code>domcontentloaded</code>, <code>load</code>, <code>networkidle2</code> and <code>networkidle0</code>. <a href=\"https://pptr.dev/#?product=Puppeteer&show=api-pagegotourl-options\" target=\"_blank\">See Puppeteer docs</a>.",
            "default": ["networkidle2"],
            "prefill": ["networkidle2"],
            "editor": "json"
        },
        "debugLog": {
            "sectionCaption": "Advanced configuration",
            "title": "Debug log",
            "type": "boolean",
            "description": "Debug messages will be included in the log. Use <code>context.log.debug('message')</code> to log your own debug messages.",
            "default": false,
            "groupCaption": "Enable logs",
            "groupDescription": "Logs settings"
        },
        "browserLog": {
            "title": "Browser log",
            "type": "boolean",
            "description": "Console messages from the browser will be included in the log. This may result in the log being flooded by error messages, warnings and other messages of little value, especially with high concurrency.",
            "default": false
        },
        "chromeDebugger": {
            "title": "Chrome debugger [experimental]",
            "type": "boolean",
            "description": "Experimental implementation of Chrome debugger. In this mode the scraper will run with a single browser at concurrency 1. You can place <code>debugger;</code> into your page function to set up a debugger breakpoint.",
            "default": false
        },
        "customData": {
            "title": "Custom data",
            "type": "object",
            "description": "This object will be available on pageFunction's context as customData.",
            "default": {},
            "prefill": {},
            "editor": "json"
        }
    },
    "required": ["startUrls", "pageFunction"]
}
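
For orientation, here is a minimal example of an input that the schema above accepts. Only startUrls and pageFunction are required; the other values shown are the schema's own prefills plus the experimental chromeDebugger flag, and the URLs are illustrative only.

{
    "startUrls": [{ "url": "https://apify.com" }],
    "useRequestQueue": true,
    "linkSelector": "a",
    "pseudoUrls": [{ "purl": "https://apify.com[(/[\\w-]+)?]" }],
    "pageFunction": "async function pageFunction(context) {\n    debugger; // pauses here when chromeDebugger is enabled\n    return { url: context.request.url };\n}",
    "chromeDebugger": true
}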
LICENSE
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
apify.json
{
    "name": "web-scraper-experimental-debug",
    "version": "0.1",
    "buildTag": "latest",
    "env": null
}
main.js
const { runActor } = require('@apify/scraper-tools');
const CrawlerSetup = require('./src/crawler_setup');

runActor(CrawlerSetup);
package.json
{
    "name": "actor-web-scraper",
    "version": "1.0.0",
    "description": "Crawl web pages using Apify, headless Chrome and Puppeteer",
    "main": "src/main.js",
    "dependencies": {
        "@apify/scraper-tools": "^0.1.2",
        "apify": "^0.16.0",
        "content-type": "^1.0.4",
        "http-proxy": "^1.18.0",
        "request-promise": "^4.2.4",
        "underscore": "^1.9.1",
        "url": "^0.11.0"
    },
    "devDependencies": {
        "@types/puppeteer": "^1.19.0",
        "@types/underscore": "^1.9.2",
        "chai": "^4.2.0",
        "eslint": "^6.1.0",
        "eslint-config-airbnb": "^17.1.1",
        "eslint-config-airbnb-base": "^13.2.0",
        "eslint-plugin-import": "^2.18.2",
        "eslint-plugin-jsx-a11y": "^6.2.3",
        "eslint-plugin-promise": "^4.0.1",
        "eslint-plugin-react": "^7.14.3",
        "markdown-toc": "^1.2.0",
        "mocha": "^6.2.0",
        "nyc": "^14.1.1",
        "sinon": "^7.1.1"
    },
    "scripts": {
        "start": "node main.js",
        "build-toc": "./node_modules/.bin/markdown-toc README.md -i",
        "test": "nyc --reporter=html --reporter=text mocha --timeout 60000 --recursive",
        "prepare": "npm run build-toc",
        "lint": "eslint src test"
    },
    "repository": {
        "type": "git",
        "url": "https://github.com/apifytech/actor-scraper"
    },
    "author": {
        "name": "Apify",
        "email": "support@apify.com",
        "url": "https://www.apify.com"
    },
    "contributors": [
        "Marek Trunkat <marek@apify.com>",
        "Ondra Urban <ondra@apify.com>"
    ],
    "license": "Apache-2.0",
    "homepage": "https://github.com/apifytech/actor-scraper"
}
src/bundle.browser.js
/* istanbul ignore next */
/**
 * Command to be evaluated for Browser side code injection.
 * @param apifyNamespace
 */
module.exports = (apifyNamespace) => {
    (function (global, namespace) {
        if (typeof window[namespace] !== 'object') window[namespace] = {};
        /**
         * Takes a configuration object with function references
         * as an input and creates a dummy class that proxies
         * function invocations from Browser context to Node context.
         *
         * The configuration is to be provided by the
         * tools.createBrowserHandlesForObject function.
         */
        class NodeProxy {
            constructor(config) {
                if (!config || typeof config !== 'object') {
                    throw new Error('NodeProxy: Parameter config of type Object must be provided.');
                }

                Object.entries(config)
                    .forEach(([key, { value, type }]) => {
                        if (type === 'METHOD') {
                            this[key] = (...args) => global[value](...args);
                        } else if (type === 'GETTER') {
                            Object.defineProperty(this, key, {
                                get: () => global[value](),
                            });
                        } else if (type === 'VALUE') {
                            this[key] = value;
                        } else {
                            throw new Error(`Unsupported function type: ${type} for function: ${key}.`);
                        }
                    });
            }
        }

        /**
         * Exposed factory.
         * @param config
         * @return {NodeProxy}
         */
        global[namespace].createNodeProxy = config => new NodeProxy(config);

        const setup = Symbol('crawler-setup');
        const internalState = Symbol('request-internal-state');

        /**
         * Context represents everything that is available to the user
         * via the Page Function. A class is used instead of a simple object
         * to avoid having to create new instances of functions with each
         * request.
         *
         * Some properties need to be accessible to the Context,
         * but should not be exposed to the user, so they are hidden
         * behind Symbols to prevent the user from easily accessing
         * and manipulating them.
         *
         * @param {Object} options
         * @param {Object} options.crawlerSetup
         * @param {Object} options.browserHandles
         * @param {Object} options.pageFunctionArguments
         */
        class Context {
            /* eslint-disable class-methods-use-this */
            constructor(options) {
                const {
                    crawlerSetup,
                    browserHandles,
                    pageFunctionArguments,
                } = options;

                const createProxy = global[namespace].createNodeProxy;

                // Private
                this[setup] = crawlerSetup;
                this[internalState] = {
                    browserHandles,
                    requestQueue: browserHandles.requestQueue ? createProxy(browserHandles.requestQueue) : null,
                    apify: createProxy(browserHandles.apify),
                };

                // Copies of Node objects
                this.input = JSON.parse(crawlerSetup.rawInput);
                this.env = Object.assign({}, crawlerSetup.env);
                this.customData = crawlerSetup.customData;
                this.response = pageFunctionArguments.response;
                this.request = pageFunctionArguments.request;
                // Functions are not converted, so we add them this way to keep
                // them non-enumerable and avoid polluting the object.
                Reflect.defineProperty(this.request, 'pushErrorMessage', {
                    value(errorOrMessage) {
                        // It's a simplified fake of the original function.
                        const msg = (errorOrMessage && errorOrMessage.message) || `${errorOrMessage}`;
                        this.errorMessages.push(msg);
                    },
                    enumerable: false,
                });

                // Proxied Node objects
                this.globalStore = createProxy(browserHandles.globalStore);
                this.log = createProxy(browserHandles.log);

                // Browser side libraries
                if (this[setup].injectJQuery) this.jQuery = global.jQuery.noConflict(true);
                if (this[setup].injectUnderscore) this.underscoreJs = global._.noConflict();

                // Bind this to allow destructuring off context in pageFunction.
                this.getValue = this.getValue.bind(this);
                this.setValue = this.setValue.bind(this);
                this.saveSnapshot = this.saveSnapshot.bind(this);
                this.skipLinks = this.skipLinks.bind(this);
                this.enqueueRequest = this.enqueueRequest.bind(this);
                this.waitFor = this.waitFor.bind(this);
            }

            async getValue(...args) {
                return this[internalState].apify.getValue(...args);
            }

            async setValue(...args) {
                return this[internalState].apify.setValue(...args);
            }

            async saveSnapshot() {
                const handle = this[internalState].browserHandles.saveSnapshot;
                return global[handle]();
            }

            async skipLinks() {
                const handle = this[internalState].browserHandles.skipLinks;
                return global[handle]();
            }

            async enqueueRequest(requestOpts = {}, options = {}) {
                if (!this[setup].useRequestQueue) {
                    throw new Error('Input parameter "useRequestQueue" must be set to true to be able to enqueue new requests.');
                }

                const defaultRequestOpts = {
                    useExtendedUniqueKey: true,
                    keepUrlFragment: this.input.keepUrlFragments,
                };

                const newRequest = { ...defaultRequestOpts, ...requestOpts };

                const metaKey = this[setup].META_KEY;
                const defaultUserData = {
                    [metaKey]: {
                        parentRequestId: this.request.id || this.request.uniqueKey,
                        depth: this.request.userData[metaKey].depth + 1,
                    },
                };

                newRequest.userData = { ...defaultUserData, ...requestOpts.userData };

                return this[internalState].requestQueue.addRequest(newRequest, options);
            }

            async waitFor(selectorOrNumberOrFunction, options = {}) {
                if (!options || typeof options !== 'object') throw new Error('Parameter options must be an Object');
                const type = typeof selectorOrNumberOrFunction;
                if (type === 'string') return this._waitForSelector(selectorOrNumberOrFunction, options);
                if (type === 'number') return this._waitForMillis(selectorOrNumberOrFunction);
                if (type === 'function') return this._waitForFunction(selectorOrNumberOrFunction, options);
                throw new Error('Parameter selectorOrNumberOrFunction must be a string, a number or a function.');
            }

            async _waitForSelector(selector, options = {}) {
                try {
                    await this._poll(() => {
                        return !!global.document.querySelector(selector);
                    }, options);
                } catch (err) {
                    if (/timeout of \d+ms exceeded/.test(err.message)) {
                        throw new Error(`Timeout Error: waiting for selector failed: ${err.message}`);
                    }
                    throw err;
                }
            }

            async _waitForMillis(millis) {
                return new Promise(res => setTimeout(res, millis));
            }

            async _waitForFunction(predicate, options = {}) {
                try {
                    await this._poll(predicate, options);
                } catch (err) {
                    if (/timeout of \d+ms exceeded/.test(err.message)) {
                        throw new Error(`Timeout Error: waiting for function failed: ${err.message}`);
                    }
                    throw err;
                }
            }

            async _poll(predicate, options = {}) {
                const {
                    pollingIntervalMillis = 50,
                    timeoutMillis = 20000,
                } = options;
                return new Promise((resolve, reject) => {
                    let pollTimeout;
                    const handler = () => {
                        // Re-check the predicate on the polling interval and keep the
                        // handle current so the timeout below can cancel the active timer.
                        if (predicate()) return resolve();
                        pollTimeout = setTimeout(handler, pollingIntervalMillis);
                    };
                    pollTimeout = setTimeout(handler, pollingIntervalMillis);
                    setTimeout(() => {
                        clearTimeout(pollTimeout);
                        return reject(new Error(`timeout of ${timeoutMillis}ms exceeded.`));
                    }, timeoutMillis);
                });
            }
        }

        /**
         * Exposed factory.
         * @param {Object} options
         * @returns {Context}
         */
        global[namespace].createContext = (options) => {
            return new Context(options);
        };
    }(window, apifyNamespace));
};
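
To see how the Context above surfaces inside a page function, here is a minimal sketch. The waitFor, enqueueRequest and skipLinks methods are the ones bound in the constructor; the selector and follow-up URL are made-up examples, and enqueueRequest assumes useRequestQueue is enabled on input.

async function pageFunction(context) {
    const { request, log, waitFor, enqueueRequest, skipLinks } = context;

    // Poll for a (hypothetical) selector, using the default 20s timeout.
    await waitFor('.product-list');

    // Manually enqueue a follow-up URL; throws unless useRequestQueue is true.
    await enqueueRequest({ url: 'https://example.com/page-2' });

    // Suppress the automatic link enqueuing for this page.
    await skipLinks();

    log.info(`Done with ${request.url}`);
    return { url: request.url };
}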
src/crawler_setup.js
const Apify = require('apify');
const _ = require('underscore');
const httpProxy = require('http-proxy');
const url = require('url');
const contentType = require('content-type');
const {
    tools,
    browserTools,
    constants: { META_KEY, DEFAULT_VIEWPORT, DEVTOOLS_TIMEOUT_SECS },
} = require('@apify/scraper-tools');
const rp = require('request-promise');

const GlobalStore = require('./global_store');
const createBundle = require('./bundle.browser');
const SCHEMA = require('../INPUT_SCHEMA');

const CHROME_DEBUGGER_PORT = 4322;

const { utils: { log, puppeteer } } = Apify;

/**
 * Replicates the INPUT_SCHEMA with JavaScript types for quick reference
 * and IDE type check integration.
 *
 * @typedef {Object} Input
 * @property {Object[]} startUrls
 * @property {boolean} useRequestQueue
 * @property {Object[]} pseudoUrls
 * @property {string} linkSelector
 * @property {boolean} keepUrlFragments
 * @property {string} pageFunction
 * @property {Object} proxyConfiguration
 * @property {boolean} debugLog
 * @property {boolean} browserLog
 * @property {boolean} injectJQuery
 * @property {boolean} injectUnderscore
 * @property {boolean} downloadMedia
 * @property {boolean} downloadCss
 * @property {number} maxRequestRetries
 * @property {number} maxPagesPerCrawl
 * @property {number} maxResultsPerCrawl
 * @property {number} maxCrawlingDepth
 * @property {number} maxConcurrency
 * @property {number} pageLoadTimeoutSecs
 * @property {number} pageFunctionTimeoutSecs
 * @property {Object} customData
 * @property {Array} initialCookies
 * @property {Array} waitUntil
 * @property {boolean} useChrome
 * @property {boolean} useStealth
 * @property {boolean} ignoreCorsAndCsp
 * @property {boolean} ignoreSslErrors
 */

/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a pageFunction invocation.
 */
class CrawlerSetup {
    /* eslint-disable class-methods-use-this */
    constructor(input) {
        this.name = 'Web Scraper';
        // Set log level early to prevent missed messages.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep this as a string to be immutable.
        this.rawInput = JSON.stringify(input);

        // Attempt to load the page function from disk if not present on input.
        tools.maybeLoadPageFunctionFromDisk(input, __dirname);

        // Validate INPUT if not running on the Apify Cloud Platform.
        if (!Apify.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        /**
         * @type {Input}
         */
        this.input = input;
        this.env = Apify.getEnv();

        // Validations
        if (this.input.pseudoUrls.length && !this.input.useRequestQueue) {
            throw new Error('Cannot enqueue links using Pseudo URLs without using a Request Queue. '
                + 'Either select the "Use Request Queue" option to enable Request Queue or '
                + 'remove your Pseudo URLs.');
        }
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) throw new Error('The pseudoUrls Array must only contain Objects.');
            if (purl.userData && !tools.isPlainObject(purl.userData)) throw new Error('The userData property of a pseudoUrl must be an Object.');
        });
        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) throw new Error('The initialCookies Array must only contain Objects.');
        });
        this.input.waitUntil.forEach((event) => {
            if (!/^(domcontentloaded|load|networkidle2|networkidle0)$/.test(event)) {
                throw new Error('Navigation wait until events must be valid. See tooltip.');
            }
        });
        tools.evalFunctionOrThrow(this.input.pageFunction);

        // Used to store page specific data.
        this.pageContexts = new WeakMap();

        // Used to store data that persists across navigations.
        this.globalStore = new GlobalStore();

        // Excluded resources
        this.blockedUrlPatterns = [];
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [...this.blockedUrlPatterns,
                '.jpg', '.jpeg', '.png', '.svg', '.gif', '.webp', '.webm', '.ico', '.woff', '.eot',
            ];
        }
        if (!this.input.downloadCss) this.blockedUrlPatterns.push('.css');

        // Start Chromium with the debugger any time the page function includes the keyword.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Initialize async operations.
        this.crawler = null;
        this.requestList = null;
        this.requestQueue = null;
        this.dataset = null;
        this.keyValueStore = null;
        this.proxy = null;
        this.initPromise = this._initializeAsync();
    }

    async _initializeAsync() {
        // RequestList
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });
        this.requestList = await Apify.openRequestList('WEB_SCRAPER', startUrls);

        // RequestQueue if selected
        if (this.input.useRequestQueue) this.requestQueue = await Apify.openRequestQueue();

        // Dataset
        this.dataset = await Apify.openDataset();
        const { itemsCount } = await this.dataset.getInfo();
        this.pagesOutputted = itemsCount || 0;

        // KeyValueStore
        this.keyValueStore = await Apify.openKeyValueStore();
    }

    /**
     * Resolves to a `PuppeteerCrawler` instance.
     * @returns {Promise<PuppeteerCrawler>}
     */
    async createCrawler() {
        await this.initPromise;

        const args = [];
        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');

        const options = {
            handlePageFunction: this._handlePageFunction.bind(this),
            requestList: this.requestList,
            requestQueue: this.requestQueue,
            handlePageTimeoutSecs: this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageFunctionTimeoutSecs,
            gotoFunction: this._gotoFunction.bind(this),
            handleFailedRequestFunction: this._handleFailedRequestFunction.bind(this),
            maxConcurrency: this.input.maxConcurrency,
            maxRequestRetries: this.input.maxRequestRetries,
            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
            proxyUrls: this.input.proxyConfiguration.proxyUrls,
            // launchPuppeteerFunction: use default,
            puppeteerPoolOptions: {
                useLiveView: true,
                recycleDiskCache: true,
            },
            launchPuppeteerOptions: {
                ...(_.omit(this.input.proxyConfiguration, 'proxyUrls')),
                ignoreHTTPSErrors: this.input.ignoreSslErrors,
                defaultViewport: DEFAULT_VIEWPORT,
                devtools: this.devtools,
                headless: false,
                useChrome: this.input.useChrome,
                stealth: this.input.useStealth,
                args,
            },
        };

        if (this.input.chromeDebugger) {
            log.info(`Starting scraper with Chrome debugger attached:
- Scraper will run with concurrency 1.
- The whole run will be performed with a single browser (= 1 proxy IP).
- You can place "debugger;" into your page function to set up a breakpoint.`);

            options.puppeteerPoolOptions.useLiveView = false;
            options.puppeteerPoolOptions.retireInstanceAfterRequestCount = 1000000;
            options.maxConcurrency = 1;
            options.launchPuppeteerFunction = async (launchPuppeteerOptions) => {
                launchPuppeteerOptions.args = [`--remote-debugging-port=${CHROME_DEBUGGER_PORT}`, '--user-data-dir=remote-profile'];
                launchPuppeteerOptions.defaultViewport = { width: 720, height: 1024 };

                const browser = await Apify.launchPuppeteer(launchPuppeteerOptions);
                const containerHost = url.parse(process.env.APIFY_CONTAINER_URL).host;

                // Start an HTTP proxy from the live view port to the Chrome debugger port.
                if (this.proxy) this.proxy.close();
                this.proxy = httpProxy.createServer({
                    target: {
                        host: 'localhost',
                        port: CHROME_DEBUGGER_PORT,
                    },
                    ws: true,
                }).listen(4321);

                // Fetches the debugger URL of the page we are scraping.
                const debuggerUrlPromise = (async () => {
                    while (true) {
                        const pageList = await rp({ url: `http://localhost:${CHROME_DEBUGGER_PORT}/json/list`, json: true });

                        // All the internal debugger pages start with something like "devtools://"...
                        const debuggerPage = pageList.filter(page => page.url.startsWith('http')).pop();
                        if (debuggerPage) return debuggerPage;

                        await Apify.utils.sleep(1000);
                    }
                })();

                // Intercept requests to the HTTP proxy.
                this.proxy.on('proxyReq', async (proxyReq, req, res) => {
                    // We need Chrome to think that it's on localhost, otherwise it throws an error...
                    proxyReq.setHeader('Host', 'localhost');

                    // URL of the debugger of the page we are scraping.
                    let targetUrl = req.url;
                    if (targetUrl === '/') {
                        proxyReq.socket.pause(); // We need to pause it to be able to perform async operations.
                        targetUrl = (await debuggerUrlPromise).devtoolsFrontendUrl; // This one includes ws=localhost, so the redirect below will be performed.
                        targetUrl = targetUrl.replace(`:${CHROME_DEBUGGER_PORT}`, '');
                        proxyReq.socket.resume();
                    }

                    // ...but then Chrome thinks it's running on localhost, so we need to change
                    // the websocket host in the URL to be correct.
                    if (targetUrl.includes('ws=localhost')) {
                        const newUrl = targetUrl.replace('ws=localhost', `wss=${containerHost}`);
                        res.writeHead(301, { Location: newUrl });
                        res.end();
                    }
                });

                return browser;
            };
        }

        this.crawler = new Apify.PuppeteerCrawler(options);

        if (this.input.chromeDebugger) {
            // this.crawler.puppeteerPool is only set later, inside crawler.run(),
            // so poll until it becomes available.
            const interval = setInterval(() => {
                if (this.crawler.puppeteerPool) {
                    this.crawler.puppeteerPool.reusePages = true;
                    clearInterval(interval);
                }
            }, 10);
        }

        return this.crawler;
    }

    async _gotoFunction({ request, page }) {
        const start = process.hrtime();

        // Create a new page context with a new random key for the Apify namespace.
        const pageContext = {
            apifyNamespace: await tools.createRandomHash(),
            skipLinks: false,
        };
        this.pageContexts.set(page, pageContext);

        // Attach a console listener to get all logs as soon as possible.
        if (this.input.browserLog) browserTools.dumpConsole(page);

        // Prevent download of stylesheets and media, unless selected otherwise.
        if (this.blockedUrlPatterns.length) {
            await puppeteer.blockRequests(page, {
                urlPatterns: this.blockedUrlPatterns,
            });
        }

        // Add initial cookies, if any.
        if (this.input.initialCookies.length) await page.setCookie(...this.input.initialCookies);

        // Disable content security policy.
        if (this.input.ignoreCorsAndCsp) await page.setBypassCSP(true);

        tools.logPerformance(request, 'gotoFunction INIT', start);
        const handleStart = process.hrtime();
        pageContext.browserHandles = await this._injectBrowserHandles(page, pageContext);
        tools.logPerformance(request, 'gotoFunction INJECTION HANDLES', handleStart);

        const evalStart = process.hrtime();
        await Promise.all([
            page.evaluateOnNewDocument(createBundle, pageContext.apifyNamespace),
            page.evaluateOnNewDocument(browserTools.wrapPageFunction(this.input.pageFunction, pageContext.apifyNamespace)),
        ]);
        tools.logPerformance(request, 'gotoFunction INJECTION EVAL', evalStart);

        // Invoke navigation.
        const navStart = process.hrtime();
        const response = await puppeteer.gotoExtended(page, request, {
            timeout: (this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageLoadTimeoutSecs) * 1000,
            waitUntil: this.input.waitUntil,
        });
        await this._waitForLoadEventWhenXml(page, response);
        tools.logPerformance(request, 'gotoFunction NAVIGATION', navStart);

        const delayStart = process.hrtime();
        await this._assertNamespace(page, pageContext.apifyNamespace);

        // Inject selected libraries.
        if (this.input.injectJQuery) await puppeteer.injectJQuery(page);
        if (this.input.injectUnderscore) await puppeteer.injectUnderscore(page);

        tools.logPerformance(request, 'gotoFunction INJECTION DELAY', delayStart);
        tools.logPerformance(request, 'gotoFunction EXECUTION', start);
        return response;
    }

    _handleFailedRequestFunction({ request }) {
        const lastError = request.errorMessages[request.errorMessages.length - 1];
        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
        log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
        return this._handleResult(request, {}, null, true);
    }

    /**
     * First of all, it initializes the state that is exposed to the user via
     * the `pageFunction` context.
     *
     * Then it invokes the user provided `pageFunction` with the prescribed context
     * and saves its return value.
     *
     * Finally, it makes decisions based on the current state and post-processes
     * the data returned from the `pageFunction`.
     * @param {Object} environment
     * @returns {Function}
     */
    async _handlePageFunction({ request, response, page, autoscaledPool }) {
        const start = process.hrtime();

        const pageContext = this.pageContexts.get(page);

        /**
         * PRE-PROCESSING
         */
        // Make sure that an object containing internal metadata
        // is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(autoscaledPool);
        if (aborted) return;

        // Set up the Context and pass the configuration down to the Browser.
        const contextOptions = {
            crawlerSetup: Object.assign(
                _.pick(this, ['rawInput', 'env']),
                _.pick(this.input, ['customData', 'useRequestQueue', 'injectJQuery', 'injectUnderscore']),
                { META_KEY },
            ),
            browserHandles: pageContext.browserHandles,
            pageFunctionArguments: {
                request,
                response: {
                    status: response && response.status(),
                    headers: response && response.headers(),
                },
            },
        };

        /**
         * USER FUNCTION EXECUTION
         */
        tools.logPerformance(request, 'handlePageFunction PREPROCESSING', start);
        const startUserFn = process.hrtime();

        const namespace = pageContext.apifyNamespace;
        const output = await page.evaluate(async (ctxOpts, namespc) => {
            /* eslint-disable no-shadow */
            const context = window[namespc].createContext(ctxOpts);
            const output = {};
            try {
                output.pageFunctionResult = await window[namespc].pageFunction(context);
            } catch (err) {
                output.pageFunctionError = Object.getOwnPropertyNames(err)
                    .reduce((memo, name) => {
                        memo[name] = err[name];
                        return memo;
                    }, {});
            }
            // This needs to be added after pageFunction has run.
            output.requestFromBrowser = context.request;

            /**
             * Since Dates cannot travel back to Node and Puppeteer does not use .toJSON
             * to stringify, they come back as empty objects. We could use JSON.stringify
             * ourselves, but that exposes us to overridden .toJSON in the target websites.
             * This hack is not ideal, but it avoids both problems.
             */
            function replaceAllDatesInObjectWithISOStrings(obj) {
                for (const [key, value] of Object.entries(obj)) {
                    if (value instanceof Date && typeof value.toISOString === 'function') {
                        obj[key] = value.toISOString();
                    } else if (value && typeof value === 'object') {
                        replaceAllDatesInObjectWithISOStrings(value);
                    }
                }
                return obj;
            }

            return replaceAllDatesInObjectWithISOStrings(output);
        }, contextOptions, namespace);

        tools.logPerformance(request, 'handlePageFunction USER FUNCTION', startUserFn);
        const finishUserFn = process.hrtime();

        /**
         * POST-PROCESSING
         */
        const { pageFunctionResult, requestFromBrowser, pageFunctionError } = output;
        // Merge requestFromBrowser into request to preserve modifications that
        // may have been made in the browser context.
        Object.assign(request, requestFromBrowser);

        // Throw the error from pageFunction, if any.
        if (pageFunctionError) throw tools.createError(pageFunctionError);

        // Enqueue more links if a link selector is available,
        // unless the user invoked the `skipLinks()` context function
        // or maxCrawlingDepth would be exceeded.
        if (!pageContext.skipLinks) {
            await this._handleLinks(page, request);
        }

        // Save the `pageFunction`s result (or just metadata) to the default dataset.
        await this._handleResult(request, response, pageFunctionResult);

        tools.logPerformance(request, 'handlePageFunction POSTPROCESSING', finishUserFn);
        tools.logPerformance(request, 'handlePageFunction EXECUTION', start);
    }

    async _handleMaxResultsPerCrawl(autoscaledPool) {
        if (!this.input.maxResultsPerCrawl || this.pagesOutputted < this.input.maxResultsPerCrawl) return false;
        log.info(`User set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`);
        await autoscaledPool.abort();
        return true;
    }

    async _handleLinks(page, request) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const start = process.hrtime();

        const currentDepth = request.userData[META_KEY].depth;
        const hasReachedMaxDepth = this.input.maxCrawlingDepth && currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(`Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`);
            return;
        }

        const enqueueOptions = {
            page,
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            requestQueue: this.requestQueue,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData = {
                    [META_KEY]: {
                        parentRequestId: request.id || request.uniqueKey,
                        depth: currentDepth + 1,
                    },
                };
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        await Apify.utils.enqueueLinks(enqueueOptions);

        tools.logPerformance(request, 'handleLinks EXECUTION', start);
    }

    async _handleResult(request, response, pageFunctionResult, isError) {
        const start = process.hrtime();
        const payload = tools.createDatasetPayload(request, response, pageFunctionResult, isError);
        await Apify.pushData(payload);
        this.pagesOutputted++;
        tools.logPerformance(request, 'handleResult EXECUTION', start);
    }

    async _assertNamespace(page, namespace) {
        try {
            await page.waitFor(nmspc => !!window[nmspc], { timeout: this.input.pageLoadTimeoutSecs * 1000 }, namespace);
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Injection of environment into the browser context timed out. '
                    + 'If this persists even after retries, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }

    async _waitForLoadEventWhenXml(page, response) {
        // Response can sometimes be null.
        if (!response) return;

        const cTypeHeader = response.headers()['content-type'];
        try {
            const { type } = contentType.parse(cTypeHeader);
            if (!/^(text|application)\/xml$|\+xml$/.test(type)) return;
        } catch (err) {
            // An invalid type is not XML.
            return;
        }

        try {
            const timeout = this.input.pageLoadTimeoutSecs * 1000;
            await page.waitFor(() => document.readyState === 'complete', { timeout });
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Parsing of the XML in the page timed out. If you\'re expecting a large XML file, '
                    + 'such as a site map, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }

    async _injectBrowserHandles(page, pageContext) {
        const saveSnapshotP = browserTools.createBrowserHandle(page, () => browserTools.saveSnapshot({ page }));
        const skipLinksP = browserTools.createBrowserHandle(page, () => { pageContext.skipLinks = true; });
        const globalStoreP = browserTools.createBrowserHandlesForObject(
            page,
            this.globalStore,
            ['size', 'clear', 'delete', 'entries', 'get', 'has', 'keys', 'set', 'values'],
        );
        const logP = browserTools.createBrowserHandlesForObject(
            page,
            log,
            ['LEVELS', 'setLevel', 'getLevel', 'debug', 'info', 'warning', 'error', 'exception'],
        );
        const apifyP = browserTools.createBrowserHandlesForObject(page, Apify, ['getValue', 'setValue']);
        const requestQueueP = this.requestQueue
            ? browserTools.createBrowserHandlesForObject(page, this.requestQueue, ['addRequest'])
            : null;

        const [
            saveSnapshot,
            skipLinks,
            globalStore,
            logHandle,
            apify,
            requestQueue,
        ] = await Promise.all([saveSnapshotP, skipLinksP, globalStoreP, logP, apifyP, requestQueueP]);

        const handles = {
            saveSnapshot,
            skipLinks,
            globalStore,
            log: logHandle,
            apify,
        };
        if (requestQueue) handles.requestQueue = requestQueue;
        return handles;
    }
}

module.exports = CrawlerSetup;
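
The debugger wiring in createCrawler() reduces to an http-proxy server that forwards both HTTP and WebSocket traffic from the live-view port (4321) to Chrome's remote-debugging port (CHROME_DEBUGGER_PORT, 4322), while spoofing the Host header. A stripped-down sketch of just that mechanism, with the ports and the Host-header trick taken from the code above:

const httpProxy = require('http-proxy');

// Forward HTTP traffic and WebSocket upgrades (which DevTools relies on)
// from port 4321 to the Chrome debugger listening on port 4322.
const proxy = httpProxy.createServer({
    target: { host: 'localhost', port: 4322 },
    ws: true,
}).listen(4321);

// Chrome's debugger endpoint rejects requests with a foreign Host header,
// so every proxied request pretends to come from localhost.
proxy.on('proxyReq', (proxyReq) => {
    proxyReq.setHeader('Host', 'localhost');
});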
src/global_store.js
/**
 * GlobalStore is a trivial storage that resembles a Map and is used from Browser contexts
 * to retain data through page navigations and browser instances. It limits Map's functionality
 * because it's currently impossible for functions and object references to cross the
 * Node-Browser threshold.
 */
class GlobalStore extends Map {
    /* eslint-disable class-methods-use-this */
    get(key) {
        if (typeof key !== 'string') throw new Error('GlobalStore#get parameter "key" must be a string.');
        return super.get(key);
    }

    set(key, value) {
        if (typeof key !== 'string') throw new Error('GlobalStore#set parameter "key" must be a string.');
        return super.set(key, value);
    }

    forEach() {
        throw new Error('GlobalStore#forEach function is not available due to underlying technology limitations.');
    }

    values() {
        return Array.from(super.values());
    }

    keys() {
        return Array.from(super.keys());
    }

    entries() {
        return Array.from(super.entries());
    }

    get size() {
        return super.size;
    }
}

module.exports = GlobalStore;
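
From the browser side, GlobalStore is only reachable through the NodeProxy defined in src/bundle.browser.js, so every call crosses to Node and comes back as a promise. A hedged usage sketch inside a page function follows; the key name is illustrative.

async function pageFunction(context) {
    const { globalStore, request } = context;

    // Read a counter that persists across navigations and browser instances.
    const count = (await globalStore.get('pagesSeen')) || 0;

    // Keys must be strings; values should be serializable, since they
    // cross the Node-Browser threshold described above.
    await globalStore.set('pagesSeen', count + 1);

    return { url: request.url, pagesSeen: count + 1 };
}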
test/bundle.browser.js
const { expect } = require('chai');
const Apify = require('apify');
const createBundle = require('../src/bundle.browser');

const NAMESPACE = 'Apify';

describe('Bundle', () => {
    let browser;
    let page;
    before(async () => {
        browser = await Apify.launchPuppeteer({ headless: true });
    });
    after(async () => {
        await browser.close();
    });
    beforeEach(async () => {
        page = await browser.newPage();
        await page.evaluateOnNewDocument(createBundle, NAMESPACE);
    });
    afterEach(async () => {
        await page.close();
    });
    describe('Context', () => {
        const CONTEXT_OPTIONS = {
            crawlerSetup: {
                rawInput: '{}',
            },
            browserHandles: {
                apify: {},
                globalStore: {},
                log: {},
            },
            pageFunctionArguments: {
                request: {},
            },
        };
        beforeEach(async () => {
            await page.goto('about:chrome');
            await page.waitFor(namespace => !!window[namespace], {}, NAMESPACE);
            await page.evaluate((namespace, contextOptions) => {
                window.contextInstance = window[namespace].createContext(contextOptions);
            }, NAMESPACE, CONTEXT_OPTIONS);
        });
        describe('waitFor', () => {
            it('should work with a number', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    await ctx.waitFor(10);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a selector', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    setTimeout(() => {
                        const el = document.createElement('div');
                        el.id = 'very-unique-id';
                        document.body.appendChild(el);
                    }, 10);
                    await ctx.waitFor('#very-unique-id');
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a function', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    let done = false;
                    const start = Date.now();
                    setTimeout(() => {
                        done = true;
                    }, 10);
                    await ctx.waitFor(() => done);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
        });
    });
});