Web Scraper Experimental Debug

mtrunkat/web-scraper-experimental-dbgr
Experimental version of Apify Web Scraper with Chrome debugger integrated

.eslintrc.json

{
  "extends": "airbnb-base",
  "plugins": [
    "import",
    "promise"
  ],
  "rules": {
    "indent": ["error", 4],
    "no-underscore-dangle": [2, {
      "allow": ["_id"],
      "allowAfterThis": true
    }],
    "no-use-before-define": 0,
    "no-param-reassign": 0,
    "consistent-return": 0,
    "array-callback-return": 0,
    "object-curly-newline": 0,
    "arrow-body-style": 0,
    "no-plusplus": 0,
    "strict": ["error", "global"],
    "max-len": ["error", 150],
    "no-undef": 0,
    "func-names": 0,
    "import/prefer-default-export": 0,
    "import/no-absolute-path": 0,
    "import/no-extraneous-dependencies": ["error", { "devDependencies": ["**/test/**/*.js"] }],
    "no-await-in-loop": 0,
    "no-restricted-syntax": ["error", "ForInStatement", "LabeledStatement", "WithStatement"]
  }
}

.gitignore

.idea
apify_storage
node_modules
.nyc_output
coverage
src/page_function.js
.DS_Store

Dockerfile

FROM apify/actor-node-chrome-xvfb

COPY package.json package-lock.json ./

# Install default dependencies, print versions of everything
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list || true \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

COPY . ./

INPUT_SCHEMA.json

{
    "title": "Web Scraper Input",
    "type": "object",
    "description": "Use the following form to configure Web Scraper. The Start URLs and Page function are required and all other fields are optional. If you want to use Pseudo URLs, you also need to Use request queue. To learn more about the different options, click on their title or on the nearby question marks. For details about the Page function visit the actor's README in the ACTOR INFO tab. We also have a <a href=\"https://apify.com/docs/scraping/tutorial/introduction\" target=\"_blank\">step by step tutorial</a> to help you with the basics.\n\nIf you get stuck and it seems that you can no longer progress with the Web Scraper, try <a href=\"https://apify.com/docs/scraping/tutorial/puppeteer-scraper\" target=\"_blank\">Puppeteer Scraper</a>. It's a more powerful tool.",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "sectionCaption": "Basic configuration",
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "useRequestQueue": {
            "title": "Use request queue",
            "type": "boolean",
            "description": "Request queue enables recursive crawling and the use of Pseudo-URLs, Link selector and <code>context.enqueueRequest()</code>.",
            "default": true
        },
        "pseudoUrls": {
            "title": "Pseudo-URLs",
            "type": "array",
            "description": "Pseudo-URLs to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Pseudo-URLs will cause the scraper to enqueue all links matched by the Link selector.",
            "editor": "pseudoUrls",
            "default": [],
            "prefill": [
                {
                    "purl": "https://apify.com[(/[\\w-]+)?]"
                }
            ]
        },
        "linkSelector": {
            "title": "Link selector",
            "type": "string",
            "description": "CSS selector matching elements with 'href' attributes that should be enqueued. To enqueue URLs from <code><div class=\"my-class\" href=...></code> tags, you would enter <strong>div.my-class</strong>. Leave empty to ignore all links.",
            "editor": "textfield",
            "prefill": "a"
        },
        "keepUrlFragments": {
            "title": "Keep URL fragments",
            "type": "boolean",
            "description": "URL fragments (the parts of a URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
            "default": false
        },
        "pageFunction": {
            "title": "Page function",
            "type": "string",
            "description": "Function executed for each request",
            "prefill": "async function pageFunction(context) {\n    // See README for context properties. If the syntax is unfamiliar, see the link\n    // https://javascript.info/destructuring-assignment#object-destructuring\n    const { request, log, jQuery } = context;\n\n    // To be able to use jQuery as $, one needs to save it in a variable\n    // and select the Inject jQuery option. We've selected it for you.\n    const $ = jQuery;\n    const title = $('title').text();\n\n    // This is yet another feature of JavaScript called template strings.\n    // https://javascript.info/string#quotes\n    log.info(`URL: ${request.url} TITLE: ${title}`);\n\n    // To save data, just return an object with the requested properties.\n    return {\n        url: request.url,\n        title\n    };\n}",
            "editor": "javascript"
        },
        "injectJQuery": {
            "title": "jQuery",
            "type": "boolean",
            "description": "The jQuery library will be injected into each page. If the page already uses jQuery, conflicts may arise.",
            "default": true,
            "groupCaption": "Inject library",
            "groupDescription": "Settings that help inject third-party libraries into the scraper's pageFunction."
        },
        "injectUnderscore": {
            "title": "Underscore",
            "type": "boolean",
            "description": "The Underscore.js library will be injected into each page. If the page already uses Underscore.js (or other libraries that attach to '_', such as Lodash), conflicts may arise.",
            "default": false
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and browser configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Choose to use no proxy, Apify Proxy, or provide custom proxy URLs.",
            "prefill": { "useApifyProxy": false },
            "default": {},
            "editor": "proxy"
        },
        "initialCookies": {
            "title": "Initial cookies",
            "type": "array",
            "description": "The provided cookies will be pre-set on all pages the scraper opens.",
            "default": [],
            "prefill": [],
            "editor": "json"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "The scraper will use a real Chrome browser instead of Chromium masquerading as Chrome. Using this option may help with bypassing certain anti-scraping protections, but risks that the scraper will be unstable or not work at all.",
            "default": false,
            "groupCaption": "Browser masking",
            "groupDescription": "Settings that help mask the scraper as a real user and prevent its detection, but increase the chances of it not working at all with some websites."
        },
        "useStealth": {
            "title": "Use Stealth",
            "type": "boolean",
            "description": "The scraper will apply various browser emulation techniques to match a real user as closely as possible. This feature works best in conjunction with the Use Chrome option and also carries the risk of making the scraper unstable.",
            "default": false
        },
        "ignoreSslErrors": {
            "title": "Ignore SSL errors",
            "type": "boolean",
            "description": "The scraper will ignore SSL certificate errors.",
            "default": false,
            "groupCaption": "Security",
            "groupDescription": "Various options that help you disable browser security features such as HTTPS certificates and CORS."
        },
        "ignoreCorsAndCsp": {
            "title": "Ignore CORS and CSP",
            "type": "boolean",
            "description": "The scraper will ignore the CSP (content security policy) and CORS (cross-origin resource sharing) settings of visited pages and requested domains. This enables you to freely use XHR/Fetch to make HTTP requests from the scraper.",
            "default": false
        },
        "downloadMedia": {
            "sectionCaption": "Performance and limits",
            "title": "Download media",
            "type": "boolean",
            "description": "The scraper will download media such as images, fonts, videos and sounds. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
            "default": true,
            "groupCaption": "Resources",
            "groupDescription": "Settings that help to disable downloading web page resources."
        },
        "downloadCss": {
            "title": "Download CSS",
            "type": "boolean",
            "description": "The scraper will download CSS stylesheets. Disabling this may speed up the scrape, but certain websites could stop working correctly.",
            "default": true
        },
        "maxRequestRetries": {
            "title": "Max request retries",
            "type": "integer",
            "description": "Maximum number of times the request for the page will be retried in case of an error. Setting it to 0 means that the request will be attempted once and will not be retried if it fails.",
            "minimum": 0,
            "default": 3,
            "unit": "retries"
        },
        "maxPagesPerCrawl": {
            "title": "Max pages per run",
            "type": "integer",
            "description": "Maximum number of pages that the scraper will open. 0 means unlimited.",
            "minimum": 0,
            "default": 0,
            "unit": "pages"
        },
        "maxResultsPerCrawl": {
            "title": "Max result records",
            "type": "integer",
            "description": "Maximum number of results that will be saved to the dataset. The scraper will terminate afterwards. 0 means unlimited.",
            "minimum": 0,
            "default": 0,
            "unit": "results"
        },
        "maxCrawlingDepth": {
            "title": "Max crawling depth",
            "type": "integer",
            "description": "Defines how many links away from the Start URLs the scraper will descend. 0 means unlimited.",
            "minimum": 0,
            "default": 0
        },
        "maxConcurrency": {
            "title": "Max concurrency",
            "type": "integer",
            "description": "Defines how many pages can be processed by the scraper in parallel. The scraper automatically increases and decreases concurrency based on available system resources. Use this option to set a hard limit.",
            "minimum": 1,
            "default": 50
        },
        "pageLoadTimeoutSecs": {
            "title": "Page load timeout",
            "type": "integer",
            "description": "Maximum time the scraper will allow a web page to load, in seconds.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "pageFunctionTimeoutSecs": {
            "title": "Page function timeout",
            "type": "integer",
            "description": "Maximum time the scraper will wait for the page function to execute, in seconds.",
            "minimum": 1,
            "default": 60,
            "maximum": 360,
            "unit": "secs"
        },
        "waitUntil": {
            "title": "Navigation wait until",
            "type": "array",
            "description": "The scraper will wait until the selected events are triggered in the page before executing the page function. Available events are <code>domcontentloaded</code>, <code>load</code>, <code>networkidle2</code> and <code>networkidle0</code>. <a href=\"https://pptr.dev/#?product=Puppeteer&show=api-pagegotourl-options\" target=\"_blank\">See Puppeteer docs</a>.",
            "default": ["networkidle2"],
            "prefill": ["networkidle2"],
            "editor": "json"
        },
        "debugLog": {
            "sectionCaption": "Advanced configuration",
            "title": "Debug log",
            "type": "boolean",
            "description": "Debug messages will be included in the log. Use <code>context.log.debug('message')</code> to log your own debug messages.",
            "default": false,
            "groupCaption": "Enable logs",
            "groupDescription": "Log settings"
        },
        "browserLog": {
            "title": "Browser log",
            "type": "boolean",
            "description": "Console messages from the browser will be included in the log. This may result in the log being flooded by error messages, warnings and other messages of little value, especially with high concurrency.",
            "default": false
        },
        "chromeDebugger": {
            "title": "Chrome debugger [experimental]",
            "type": "boolean",
            "description": "Experimental implementation of the Chrome debugger. In this mode, the scraper will run with a single browser at concurrency 1. You can place <code>debugger;</code> into your page function to set up a debugger breakpoint.",
            "default": false
        },
        "customData": {
            "title": "Custom data",
            "type": "object",
            "description": "This object will be available on the pageFunction's context as customData.",
            "default": {},
            "prefill": {},
            "editor": "json"
        }
    },
    "required": ["startUrls", "pageFunction"]
}
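
For reference, a minimal input that validates against the schema above might look as follows. The values are illustrative only; note the debugger; statement in the page function, which sets a breakpoint when the Chrome debugger option described above is enabled:

{
    "startUrls": [{ "url": "https://apify.com" }],
    "useRequestQueue": true,
    "pseudoUrls": [{ "purl": "https://apify.com[(/[\\w-]+)?]" }],
    "linkSelector": "a",
    "chromeDebugger": true,
    "pageFunction": "async function pageFunction(context) {\n    const { request, jQuery } = context;\n    debugger; // pauses here when the Chrome debugger is attached\n    return { url: request.url, title: jQuery('title').text() };\n}"
}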

LICENSE

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

apify.json

{
    "name": "web-scraper-experimental-debug",
    "version": "0.1",
    "buildTag": "latest",
    "env": null
}

main.js

const { runActor } = require('@apify/scraper-tools');
const CrawlerSetup = require('./src/crawler_setup');

runActor(CrawlerSetup);
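
The entry point is intentionally thin: the actor lifecycle lives in @apify/scraper-tools, while the crawler itself is supplied by CrawlerSetup (see src/crawler_setup.js below). As a rough sketch of that contract, assuming the conventional Apify actor pattern (this is an illustration, not the library's actual source):

// Hypothetical sketch of what runActor(CrawlerSetup) amounts to,
// based on the CrawlerSetup API visible in src/crawler_setup.js.
const Apify = require('apify');

const runActorSketch = (CrawlerSetup) => {
    Apify.main(async () => {
        const input = await Apify.getInput();        // fetch the actor INPUT
        const setup = new CrawlerSetup(input);       // validates input, opens storages
        const crawler = await setup.createCrawler(); // resolves to a PuppeteerCrawler
        await crawler.run();                         // crawl until all requests are handled
    });
};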

package.json

{
  "name": "actor-web-scraper",
  "version": "1.0.0",
  "description": "Crawl web pages using Apify, headless Chrome and Puppeteer",
  "main": "src/main.js",
  "dependencies": {
    "@apify/scraper-tools": "^0.1.2",
    "apify": "^0.16.0",
    "content-type": "^1.0.4",
    "http-proxy": "^1.18.0",
    "request-promise": "^4.2.4",
    "underscore": "^1.9.1",
    "url": "^0.11.0"
  },
  "devDependencies": {
    "@types/puppeteer": "^1.19.0",
    "@types/underscore": "^1.9.2",
    "chai": "^4.2.0",
    "eslint": "^6.1.0",
    "eslint-config-airbnb": "^17.1.1",
    "eslint-config-airbnb-base": "^13.2.0",
    "eslint-plugin-import": "^2.18.2",
    "eslint-plugin-jsx-a11y": "^6.2.3",
    "eslint-plugin-promise": "^4.0.1",
    "eslint-plugin-react": "^7.14.3",
    "markdown-toc": "^1.2.0",
    "mocha": "^6.2.0",
    "nyc": "^14.1.1",
    "sinon": "^7.1.1"
  },
  "scripts": {
    "start": "node main.js",
    "build-toc": "./node_modules/.bin/markdown-toc README.md -i",
    "test": "nyc --reporter=html --reporter=text mocha --timeout 60000 --recursive",
    "prepare": "npm run build-toc",
    "lint": "eslint src test"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/apifytech/actor-scraper"
  },
  "author": {
    "name": "Apify",
    "email": "support@apify.com",
    "url": "https://www.apify.com"
  },
  "contributors": [
    "Marek Trunkat <marek@apify.com>",
    "Ondra Urban <ondra@apify.com>"
  ],
  "license": "Apache-2.0",
  "homepage": "https://github.com/apifytech/actor-scraper"
}

src/bundle.browser.js

/* istanbul ignore next */
/**
 * Command to be evaluated for Browser side code injection.
 * @param apifyNamespace
 */
module.exports = (apifyNamespace) => {
    (function (global, namespace) {
        if (typeof window[namespace] !== 'object') window[namespace] = {};
        /**
         * Takes a configuration object with function references
         * as an input and creates a dummy class that proxies
         * function invocations from Browser context to Node context.
         *
         * The configuration is to be provided by the
         * tools.createBrowserHandlesForObject function.
         */
        class NodeProxy {
            constructor(config) {
                if (!config || typeof config !== 'object') {
                    throw new Error('NodeProxy: Parameter config of type Object must be provided.');
                }

                Object.entries(config)
                    .forEach(([key, { value, type }]) => {
                        if (type === 'METHOD') {
                            this[key] = (...args) => global[value](...args);
                        } else if (type === 'GETTER') {
                            Object.defineProperty(this, key, {
                                get: () => global[value](),
                            });
                        } else if (type === 'VALUE') {
                            this[key] = value;
                        } else {
                            throw new Error(`Unsupported function type: ${type} for function: ${key}.`);
                        }
                    });
            }
        }

        /**
         * Exposed factory.
         * @param config
         * @return {NodeProxy}
         */
        global[namespace].createNodeProxy = config => new NodeProxy(config);

        const setup = Symbol('crawler-setup');
        const internalState = Symbol('request-internal-state');

        /**
         * Context represents everything that is available to the user
         * via the Page Function. A class is used instead of a simple object
         * to avoid having to create new instances of functions with each
         * request.
         *
         * Some properties need to be accessible to the Context,
         * but should not be exposed to the user, thus they are hidden
         * using a Symbol to prevent the user from easily accessing
         * and manipulating them.
         *
         * @param {Object} options
         * @param {Object} options.crawlerSetup
         * @param {Object} options.browserHandles
         * @param {Object} options.pageFunctionArguments
         */
        class Context {
            /* eslint-disable class-methods-use-this */
            constructor(options) {
                const {
                    crawlerSetup,
                    browserHandles,
                    pageFunctionArguments,
                } = options;

                const createProxy = global[namespace].createNodeProxy;

                // Private
                this[setup] = crawlerSetup;
                this[internalState] = {
                    browserHandles,
                    requestQueue: browserHandles.requestQueue ? createProxy(browserHandles.requestQueue) : null,
                    apify: createProxy(browserHandles.apify),
                };

                // Copies of Node objects
                this.input = JSON.parse(crawlerSetup.rawInput);
                this.env = Object.assign({}, crawlerSetup.env);
                this.customData = crawlerSetup.customData;
                this.response = pageFunctionArguments.response;
                this.request = pageFunctionArguments.request;
                // Functions are not converted, so we need to add them this way
                // to keep them non-enumerable and thus not polluting the object.
                Reflect.defineProperty(this.request, 'pushErrorMessage', {
                    value(errorOrMessage) {
                        // It's a simplified fake of the original function.
                        const msg = (errorOrMessage && errorOrMessage.message) || `${errorOrMessage}`;
                        this.errorMessages.push(msg);
                    },
                    enumerable: false,
                });

                // Proxied Node objects
                this.globalStore = createProxy(browserHandles.globalStore);
                this.log = createProxy(browserHandles.log);

                // Browser side libraries
                if (this[setup].injectJQuery) this.jQuery = global.jQuery.noConflict(true);
                if (this[setup].injectUnderscore) this.underscoreJs = global._.noConflict();

                // Bind this to allow destructuring off context in pageFunction.
                this.getValue = this.getValue.bind(this);
                this.setValue = this.setValue.bind(this);
                this.saveSnapshot = this.saveSnapshot.bind(this);
                this.skipLinks = this.skipLinks.bind(this);
                this.enqueueRequest = this.enqueueRequest.bind(this);
                this.waitFor = this.waitFor.bind(this);
            }

            async getValue(...args) {
                return this[internalState].apify.getValue(...args);
            }

            async setValue(...args) {
                return this[internalState].apify.setValue(...args);
            }

            async saveSnapshot() {
                const handle = this[internalState].browserHandles.saveSnapshot;
                return global[handle]();
            }

            async skipLinks() {
                const handle = this[internalState].browserHandles.skipLinks;
                return global[handle]();
            }

            async enqueueRequest(requestOpts = {}, options = {}) {
                if (!this[setup].useRequestQueue) {
                    throw new Error('Input parameter "useRequestQueue" must be set to true to be able to enqueue new requests.');
                }

                const defaultRequestOpts = {
                    useExtendedUniqueKey: true,
                    keepUrlFragment: this.input.keepUrlFragments,
                };

                const newRequest = { ...defaultRequestOpts, ...requestOpts };

                const metaKey = this[setup].META_KEY;
                const defaultUserData = {
                    [metaKey]: {
                        parentRequestId: this.request.id || this.request.uniqueKey,
                        depth: this.request.userData[metaKey].depth + 1,
                    },
                };

                newRequest.userData = { ...defaultUserData, ...requestOpts.userData };

                return this[internalState].requestQueue.addRequest(newRequest, options);
            }

            async waitFor(selectorOrNumberOrFunction, options = {}) {
                if (!options || typeof options !== 'object') throw new Error('Parameter options must be an Object');
                const type = typeof selectorOrNumberOrFunction;
                if (type === 'string') return this._waitForSelector(selectorOrNumberOrFunction, options);
                if (type === 'number') return this._waitForMillis(selectorOrNumberOrFunction);
                if (type === 'function') return this._waitForFunction(selectorOrNumberOrFunction, options);
                throw new Error('Parameter selectorOrNumberOrFunction must be one of the said types.');
            }

            async _waitForSelector(selector, options = {}) {
                try {
                    await this._poll(() => {
                        return !!global.document.querySelector(selector);
                    }, options);
                } catch (err) {
                    if (/timeout of \d+ms exceeded/.test(err.message)) {
                        throw new Error(`Timeout Error: waiting for selector failed: ${err.message}`);
                    }
                    throw err;
                }
            }

            async _waitForMillis(millis) {
                return new Promise(res => setTimeout(res, millis));
            }

            async _waitForFunction(predicate, options = {}) {
                try {
                    await this._poll(predicate, options);
                } catch (err) {
                    if (/timeout of \d+ms exceeded/.test(err.message)) {
                        throw new Error(`Timeout Error: waiting for function failed: ${err.message}`);
                    }
                    throw err;
                }
            }

            async _poll(predicate, options = {}) {
                const {
                    pollingIntervalMillis = 50,
                    timeoutMillis = 20000,
                } = options;
                return new Promise((resolve, reject) => {
                    // Poll the predicate on the given interval and keep track of
                    // the latest timer so it can be cancelled once the timeout fires.
                    let pollTimeout;
                    const handler = () => {
                        if (predicate()) return resolve();
                        pollTimeout = setTimeout(handler, pollingIntervalMillis);
                    };
                    pollTimeout = setTimeout(handler, pollingIntervalMillis);
                    setTimeout(() => {
                        clearTimeout(pollTimeout);
                        return reject(new Error(`timeout of ${timeoutMillis}ms exceeded.`));
                    }, timeoutMillis);
                });
            }
        }

        /**
         * Exposed factory.
         * @param {Object} options
         * @returns {Context}
         */
        global[namespace].createContext = (options) => {
            return new Context(options);
        };
    }(window, apifyNamespace));
};
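
To make the NodeProxy mechanics concrete, here is a hypothetical handle configuration of the shape that createNodeProxy consumes. On the Node side, tools.createBrowserHandlesForObject builds such configs after Puppeteer's page.exposeFunction() registers the named handle functions on window; the apify_log_* names below are placeholders, not real handle names, and apifyNamespace is assumed to hold the random namespace string:

// Hypothetical config; each METHOD/GETTER value names a function that
// page.exposeFunction() would have registered on window (= global above).
const logHandleConfig = {
    info: { type: 'METHOD', value: 'apify_log_info_1a2b' },   // log.info(...) invokes window.apify_log_info_1a2b(...)
    debug: { type: 'METHOD', value: 'apify_log_debug_1a2b' },
    LEVELS: { type: 'VALUE', value: { INFO: 4, DEBUG: 5 } },  // plain values are copied as-is
};

const log = window[apifyNamespace].createNodeProxy(logHandleConfig);
log.info('Hello from the browser'); // transparently executed in Node context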

src/crawler_setup.js

const Apify = require('apify');
const _ = require('underscore');
const httpProxy = require('http-proxy');
const url = require('url');
const contentType = require('content-type');
const {
    tools,
    browserTools,
    constants: { META_KEY, DEFAULT_VIEWPORT, DEVTOOLS_TIMEOUT_SECS },
} = require('@apify/scraper-tools');
const rp = require('request-promise');

const GlobalStore = require('./global_store');
const createBundle = require('./bundle.browser');
const SCHEMA = require('../INPUT_SCHEMA');

const CHROME_DEBUGGER_PORT = 4322;

const { utils: { log, puppeteer } } = Apify;

/**
 * Replicates the INPUT_SCHEMA with JavaScript types for quick reference
 * and IDE type check integration.
 *
 * @typedef {Object} Input
 * @property {Object[]} startUrls
 * @property {boolean} useRequestQueue
 * @property {Object[]} pseudoUrls
 * @property {string} linkSelector
 * @property {boolean} keepUrlFragments
 * @property {string} pageFunction
 * @property {Object} proxyConfiguration
 * @property {boolean} debugLog
 * @property {boolean} browserLog
 * @property {boolean} injectJQuery
 * @property {boolean} injectUnderscore
 * @property {boolean} downloadMedia
 * @property {boolean} downloadCss
 * @property {number} maxRequestRetries
 * @property {number} maxPagesPerCrawl
 * @property {number} maxResultsPerCrawl
 * @property {number} maxCrawlingDepth
 * @property {number} maxConcurrency
 * @property {number} pageLoadTimeoutSecs
 * @property {number} pageFunctionTimeoutSecs
 * @property {Object} customData
 * @property {Array} initialCookies
 * @property {Array} waitUntil
 * @property {boolean} useChrome
 * @property {boolean} useStealth
 * @property {boolean} ignoreCorsAndCsp
 * @property {boolean} ignoreSslErrors
 */

/**
 * Holds all the information necessary for constructing a crawler
 * instance and creating a context for a pageFunction invocation.
 */
class CrawlerSetup {
    /* eslint-disable class-methods-use-this */
    constructor(input) {
        this.name = 'Web Scraper';
        // Set log level early to prevent missed messages.
        if (input.debugLog) log.setLevel(log.LEVELS.DEBUG);

        // Keep this as a string to be immutable.
        this.rawInput = JSON.stringify(input);

        // Attempt to load the page function from disk if not present on input.
        tools.maybeLoadPageFunctionFromDisk(input, __dirname);

        // Validate INPUT if not running on the Apify Cloud Platform.
        if (!Apify.isAtHome()) tools.checkInputOrThrow(input, SCHEMA);

        /**
         * @type {Input}
         */
        this.input = input;
        this.env = Apify.getEnv();

        // Validations
        if (this.input.pseudoUrls.length && !this.input.useRequestQueue) {
            throw new Error('Cannot enqueue links using Pseudo URLs without using a Request Queue. '
                + 'Either select the "Use Request Queue" option to enable Request Queue or '
                + 'remove your Pseudo URLs.');
        }
        this.input.pseudoUrls.forEach((purl) => {
            if (!tools.isPlainObject(purl)) throw new Error('The pseudoUrls Array must only contain Objects.');
            if (purl.userData && !tools.isPlainObject(purl.userData)) throw new Error('The userData property of a pseudoUrl must be an Object.');
        });
        this.input.initialCookies.forEach((cookie) => {
            if (!tools.isPlainObject(cookie)) throw new Error('The initialCookies Array must only contain Objects.');
        });
        this.input.waitUntil.forEach((event) => {
            if (!/^(domcontentloaded|load|networkidle2|networkidle0)$/.test(event)) {
                throw new Error('Navigation wait until events must be valid. See tooltip.');
            }
        });
        tools.evalFunctionOrThrow(this.input.pageFunction);

        // Used to store page specific data.
        this.pageContexts = new WeakMap();

        // Used to store data that persists across navigations.
        this.globalStore = new GlobalStore();

        // Excluded resources
        this.blockedUrlPatterns = [];
        if (!this.input.downloadMedia) {
            this.blockedUrlPatterns = [...this.blockedUrlPatterns,
                '.jpg', '.jpeg', '.png', '.svg', '.gif', '.webp', '.webm', '.ico', '.woff', '.eot',
            ];
        }
        if (!this.input.downloadCss) this.blockedUrlPatterns.push('.css');

        // Start Chromium with the debugger any time the page function includes the keyword.
        this.devtools = this.input.pageFunction.includes('debugger;');

        // Initialize async operations.
        this.crawler = null;
        this.requestList = null;
        this.requestQueue = null;
        this.dataset = null;
        this.keyValueStore = null;
        this.proxy = null;
        this.initPromise = this._initializeAsync();
    }

    async _initializeAsync() {
        // RequestList
        const startUrls = this.input.startUrls.map((req) => {
            req.useExtendedUniqueKey = true;
            req.keepUrlFragment = this.input.keepUrlFragments;
            return req;
        });
        this.requestList = await Apify.openRequestList('WEB_SCRAPER', startUrls);

        // RequestQueue, if selected
        if (this.input.useRequestQueue) this.requestQueue = await Apify.openRequestQueue();

        // Dataset
        this.dataset = await Apify.openDataset();
        const { itemsCount } = await this.dataset.getInfo();
        this.pagesOutputted = itemsCount || 0;

        // KeyValueStore
        this.keyValueStore = await Apify.openKeyValueStore();
    }

151     * Resolves to a `PuppeteerCrawler` instance.
152     * constructor.
153     * @returns {Promise<PuppeteerCrawler>}
154     */
155    async createCrawler() {
156        await this.initPromise;
157
158        const args = [];
159        if (this.input.ignoreCorsAndCsp) args.push('--disable-web-security');
160
161        const options = {
162            handlePageFunction: this._handlePageFunction.bind(this),
163            requestList: this.requestList,
164            requestQueue: this.requestQueue,
165            handlePageTimeoutSecs: this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageFunctionTimeoutSecs,
166            gotoFunction: this._gotoFunction.bind(this),
167            handleFailedRequestFunction: this._handleFailedRequestFunction.bind(this),
168            maxConcurrency: this.input.maxConcurrency,
169            maxRequestRetries: this.input.maxRequestRetries,
170            maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
171            proxyUrls: this.input.proxyConfiguration.proxyUrls,
172            // launchPuppeteerFunction: use default,
173            puppeteerPoolOptions: {
174                useLiveView: true,
175                recycleDiskCache: true,
176            },
177            launchPuppeteerOptions: {
178                ...(_.omit(this.input.proxyConfiguration, 'proxyUrls')),
179                ignoreHTTPSErrors: this.input.ignoreSslErrors,
180                defaultViewport: DEFAULT_VIEWPORT,
181                devtools: this.devtools,
182                headless: false,
183                useChrome: this.input.useChrome,
184                stealth: this.input.useStealth,
185                args,
186            },
187        };
188
        if (this.input.chromeDebugger) {
            log.info(`Starting scraper with Chrome debugger attached:
            - Scraper will run with concurrency 1.
            - The whole run will be performed with a single browser (= 1 proxy IP).
            - You can place "debugger;" into your page function to set a breakpoint.`);

            options.puppeteerPoolOptions.useLiveView = false;
            options.puppeteerPoolOptions.retireInstanceAfterRequestCount = 1000000;
            options.maxConcurrency = 1;
            options.launchPuppeteerFunction = async (launchPuppeteerOptions) => {
                launchPuppeteerOptions.args = [`--remote-debugging-port=${CHROME_DEBUGGER_PORT}`, '--user-data-dir=remote-profile'];
                launchPuppeteerOptions.defaultViewport = { width: 720, height: 1024 };

                const browser = await Apify.launchPuppeteer(launchPuppeteerOptions);
                const containerHost = url.parse(process.env.APIFY_CONTAINER_URL).host;

                // Start an HTTP proxy from the live view port to the Chrome debugger port.
                if (this.proxy) this.proxy.close();
                this.proxy = httpProxy.createServer({
                    target: {
                        host: 'localhost',
                        port: CHROME_DEBUGGER_PORT,
                    },
                    ws: true,
                }).listen(4321);

                // Fetches the debugger URL of the page we are scraping.
                const debuggerUrlPromise = (async () => {
                    while (true) {
                        const pageList = await rp({ url: `http://localhost:${CHROME_DEBUGGER_PORT}/json/list`, json: true });

                        // All the internal debugger pages start with something like "devtools://"...
                        const debuggerPage = pageList.filter(page => page.url.startsWith('http')).pop();
                        if (debuggerPage) return debuggerPage;

                        await Apify.utils.sleep(1000);
                    }
                })();

                // Intercept requests to the HTTP proxy.
                this.proxy.on('proxyReq', async (proxyReq, req, res) => {
                    // We need Chrome to think that it's on localhost, otherwise it throws an error...
                    proxyReq.setHeader('Host', 'localhost');

                    // URL of the debugger of the page we are scraping.
                    let targetUrl = req.url;
                    if (targetUrl === '/') {
                        proxyReq.socket.pause(); // We need to pause it to be able to perform async operations.
                        targetUrl = (await debuggerUrlPromise).devtoolsFrontendUrl; // This one includes ws=localhost, so the redirect below will be performed.
                        targetUrl = targetUrl.replace(`:${CHROME_DEBUGGER_PORT}`, '');
                        proxyReq.socket.resume();
                    }

                    // ...but then Chrome thinks it's running on localhost, so we need to change the websocket
                    // host in the URL to be correct.
                    if (targetUrl.includes('ws=localhost')) {
                        const newUrl = targetUrl.replace('ws=localhost', `wss=${containerHost}`);
                        res.writeHead(301, { Location: newUrl });
                        res.end();
                    }
                });

                return browser;
            };
        }

        this.crawler = new Apify.PuppeteerCrawler(options);

        if (this.input.chromeDebugger) {
            // this.crawler.puppeteerPool is only set up inside crawler.run(),
            // which is invoked later, so poll until it becomes available.
            const interval = setInterval(() => {
                if (this.crawler.puppeteerPool) {
                    this.crawler.puppeteerPool.reusePages = true;
                    clearInterval(interval);
                }
            }, 10);
        }

        return this.crawler;
    }
269
270    async _gotoFunction({ request, page }) {
271        const start = process.hrtime();
272
273        // Create a new page context with a new random key for Apify namespace.
274        const pageContext = {
275            apifyNamespace: await tools.createRandomHash(),
276            skipLinks: false,
277        };
278        this.pageContexts.set(page, pageContext);
279
280        // Attach a console listener to get all logs as soon as possible.
281        if (this.input.browserLog) browserTools.dumpConsole(page);
282
283        // Prevent download of stylesheets and media, unless selected otherwise
284        if (this.blockedUrlPatterns.length) {
285            await puppeteer.blockRequests(page, {
286                urlPatterns: this.blockedUrlPatterns,
287            });
288        }
289
290        // Add initial cookies, if any.
291        if (this.input.initialCookies.length) await page.setCookie(...this.input.initialCookies);
292
293        // Disable content security policy.
294        if (this.input.ignoreCorsAndCsp) await page.setBypassCSP(true);
295
296        tools.logPerformance(request, 'gotoFunction INIT', start);
297        const handleStart = process.hrtime();
298        pageContext.browserHandles = await this._injectBrowserHandles(page, pageContext);
299        tools.logPerformance(request, 'gotoFunction INJECTION HANDLES', handleStart);
300
301        const evalStart = process.hrtime();
302        await Promise.all([
303            page.evaluateOnNewDocument(createBundle, pageContext.apifyNamespace),
304            page.evaluateOnNewDocument(browserTools.wrapPageFunction(this.input.pageFunction, pageContext.apifyNamespace)),
305        ]);
306        tools.logPerformance(request, 'gotoFunction INJECTION EVAL', evalStart);
307
308
309        // Invoke navigation.
310        const navStart = process.hrtime();
311        const response = await puppeteer.gotoExtended(page, request, {
312            timeout: (this.devtools ? DEVTOOLS_TIMEOUT_SECS : this.input.pageLoadTimeoutSecs) * 1000,
313            waitUntil: this.input.waitUntil,
314        });
315        await this._waitForLoadEventWhenXml(page, response);
316        tools.logPerformance(request, 'gotoFunction NAVIGATION', navStart);
317
318        const delayStart = process.hrtime();
319        await this._assertNamespace(page, pageContext.apifyNamespace);
320
321        // Inject selected libraries
322        if (this.input.injectJQuery) await puppeteer.injectJQuery(page);
323        if (this.input.injectUnderscore) await puppeteer.injectUnderscore(page);
324
325        tools.logPerformance(request, 'gotoFunction INJECTION DELAY', delayStart);
326        tools.logPerformance(request, 'gotoFunction EXECUTION', start);
327        return response;
328    }
329
330    _handleFailedRequestFunction({ request }) {
331        const lastError = request.errorMessages[request.errorMessages.length - 1];
332        const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
333        log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
334        return this._handleResult(request, {}, null, true);
335    }
336
337    /**
338     * First of all, it initializes the state that is exposed to the user via
339     * `pageFunction` context.
340     *
341     * Then it invokes the user provided `pageFunction` with the prescribed context
342     * and saves its return value.
343     *
344     * Finally, it makes decisions based on the current state and post-processes
345     * the data returned from the `pageFunction`.
346     * @param {Object} environment
347     * @returns {Function}
348     */
    async _handlePageFunction({ request, response, page, autoscaledPool }) {
        const start = process.hrtime();

        const pageContext = this.pageContexts.get(page);

        /**
         * PRE-PROCESSING
         */
        // Make sure that an object containing internal metadata
        // is present on every request.
        tools.ensureMetaData(request);

        // Abort the crawler if the maximum number of results was reached.
        const aborted = await this._handleMaxResultsPerCrawl(autoscaledPool);
        if (aborted) return;

        // Set up the Context and pass the configuration down to the Browser.
        const contextOptions = {
            crawlerSetup: Object.assign(
                _.pick(this, ['rawInput', 'env']),
                _.pick(this.input, ['customData', 'useRequestQueue', 'injectJQuery', 'injectUnderscore']),
                { META_KEY },
            ),
            browserHandles: pageContext.browserHandles,
            pageFunctionArguments: {
                request,
                response: {
                    status: response && response.status(),
                    headers: response && response.headers(),
                },
            },
        };

        /**
         * USER FUNCTION EXECUTION
         */
        tools.logPerformance(request, 'handlePageFunction PREPROCESSING', start);
        const startUserFn = process.hrtime();

        const namespace = pageContext.apifyNamespace;
        const output = await page.evaluate(async (ctxOpts, namespc) => {
            /* eslint-disable no-shadow */
            const context = window[namespc].createContext(ctxOpts);
            const output = {};
            try {
                output.pageFunctionResult = await window[namespc].pageFunction(context);
            } catch (err) {
                output.pageFunctionError = Object.getOwnPropertyNames(err)
                    .reduce((memo, name) => {
                        memo[name] = err[name];
                        return memo;
                    }, {});
            }
            // This needs to be added after pageFunction has run.
            output.requestFromBrowser = context.request;

            /**
             * Since Dates cannot travel back to Node and Puppeteer does not use .toJSON
             * to stringify, they come back as empty objects. We could use JSON.stringify
             * ourselves, but that exposes us to overridden .toJSON in the target websites.
             * This hack is not ideal, but it avoids both problems.
             */
            function replaceAllDatesInObjectWithISOStrings(obj) {
                for (const [key, value] of Object.entries(obj)) {
                    if (value instanceof Date && typeof value.toISOString === 'function') {
                        obj[key] = value.toISOString();
                    } else if (value && typeof value === 'object') {
                        replaceAllDatesInObjectWithISOStrings(value);
                    }
                }
                return obj;
            }

            return replaceAllDatesInObjectWithISOStrings(output);
        }, contextOptions, namespace);

        tools.logPerformance(request, 'handlePageFunction USER FUNCTION', startUserFn);
        const finishUserFn = process.hrtime();

        /**
         * POST-PROCESSING
         */
        const { pageFunctionResult, requestFromBrowser, pageFunctionError } = output;
        // Merge requestFromBrowser into request to preserve modifications that
        // may have been made in the browser context.
        Object.assign(request, requestFromBrowser);

        // Throw the error from pageFunction, if any.
        if (pageFunctionError) throw tools.createError(pageFunctionError);

        // Enqueue more links if a link selector is available,
        // unless the user invoked the `skipLinks()` context function
        // or maxCrawlingDepth would be exceeded.
        if (!pageContext.skipLinks) {
            await this._handleLinks(page, request);
        }

        // Save the `pageFunction`'s result (or just metadata) to the default dataset.
        await this._handleResult(request, response, pageFunctionResult);

        tools.logPerformance(request, 'handlePageFunction POSTPROCESSING', finishUserFn);
        tools.logPerformance(request, 'handlePageFunction EXECUTION', start);
    }
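
As the long comment inside `page.evaluate` explains, `Date` instances do not survive the trip back from the browser: Puppeteer's value serialization turns them into empty objects. A minimal, standalone sketch of the symptom and the fix (assumes only that `puppeteer` is installed):

// Sketch: why _handlePageFunction() converts Dates to ISO strings before returning.
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();

    // Without the conversion, the Date comes back as an empty object: { when: {} }
    const broken = await page.evaluate(() => ({ when: new Date() }));
    console.log(broken);

    // With the conversion, it survives as an ISO 8601 string.
    const fixed = await page.evaluate(() => ({ when: new Date().toISOString() }));
    console.log(fixed); // e.g. { when: '2019-10-30T12:00:00.000Z' }

    await browser.close();
})();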

    async _handleMaxResultsPerCrawl(autoscaledPool) {
        if (!this.input.maxResultsPerCrawl || this.pagesOutputted < this.input.maxResultsPerCrawl) return false;
        log.info(`User-set limit of ${this.input.maxResultsPerCrawl} results was reached. Finishing the crawl.`);
        await autoscaledPool.abort();
        return true;
    }

    async _handleLinks(page, request) {
        if (!(this.input.linkSelector && this.requestQueue)) return;
        const start = process.hrtime();

        const currentDepth = request.userData[META_KEY].depth;
        const hasReachedMaxDepth = this.input.maxCrawlingDepth && currentDepth >= this.input.maxCrawlingDepth;
        if (hasReachedMaxDepth) {
            log.debug(`Request ${request.url} reached the maximum crawling depth of ${currentDepth}.`);
            return;
        }

        const enqueueOptions = {
            page,
            selector: this.input.linkSelector,
            pseudoUrls: this.input.pseudoUrls,
            requestQueue: this.requestQueue,
            transformRequestFunction: (requestOptions) => {
                requestOptions.userData = {
                    [META_KEY]: {
                        parentRequestId: request.id || request.uniqueKey,
                        depth: currentDepth + 1,
                    },
                };
                requestOptions.useExtendedUniqueKey = true;
                requestOptions.keepUrlFragment = this.input.keepUrlFragments;
                return requestOptions;
            },
        };

        await Apify.utils.enqueueLinks(enqueueOptions);

        tools.logPerformance(request, 'handleLinks EXECUTION', start);
    }
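
Every child request enqueued by `_handleLinks` carries its own depth in `userData[META_KEY]`, so depth limiting needs no global bookkeeping. A hedged, standalone sketch of the same pattern using the Apify SDK directly (the selector, pseudo-URL and depth limit are illustrative stand-ins for the actor's input):

const Apify = require('apify');

Apify.main(async () => {
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest({ url: 'https://apify.com', userData: { depth: 0 } });

    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,
        handlePageFunction: async ({ page, request }) => {
            const depth = request.userData.depth;
            if (depth >= 2) return; // stand-in for maxCrawlingDepth

            await Apify.utils.enqueueLinks({
                page,
                selector: 'a', // stand-in for input.linkSelector
                pseudoUrls: ['https://apify.com/[.*]'],
                requestQueue,
                // Each child records its own depth, exactly like META_KEY above.
                transformRequestFunction: (requestOptions) => {
                    requestOptions.userData = { depth: depth + 1 };
                    return requestOptions;
                },
            });
        },
    });

    await crawler.run();
});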

    async _handleResult(request, response, pageFunctionResult, isError) {
        const start = process.hrtime();
        const payload = tools.createDatasetPayload(request, response, pageFunctionResult, isError);
        await Apify.pushData(payload);
        this.pagesOutputted++;
        tools.logPerformance(request, 'handleResult EXECUTION', start);
    }

    async _assertNamespace(page, namespace) {
        try {
            await page.waitFor(nmspc => !!window[nmspc], { timeout: this.input.pageLoadTimeoutSecs * 1000 }, namespace);
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Injection of environment into the browser context timed out. '
                    + 'If this persists even after retries, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }
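
`page.waitFor(fn, options, ...args)` forwards the trailing arguments into the predicate, which is how `namespace` reaches the browser here. That call style was later deprecated in Puppeteer in favour of `page.waitForFunction`, which takes the same arguments; a standalone sketch of the equivalent call:

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://apify.com');

    // Arguments after the options object are forwarded into the predicate,
    // just as _assertNamespace() forwards `namespace` above.
    await page.waitForFunction(
        selector => !!document.querySelector(selector),
        { timeout: 60 * 1000 },
        'body',
    );

    await browser.close();
})();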

    async _waitForLoadEventWhenXml(page, response) {
        // Response can sometimes be null.
        if (!response) return;

        const cTypeHeader = response.headers()['content-type'];
        try {
            const { type } = contentType.parse(cTypeHeader);
            if (!/^(text|application)\/xml$|\+xml$/.test(type)) return;
        } catch (err) {
            // An unparsable content type is not XML.
            return;
        }

        try {
            const timeout = this.input.pageLoadTimeoutSecs * 1000;
            await page.waitFor(() => document.readyState === 'complete', { timeout });
        } catch (err) {
            if (err.stack.startsWith('TimeoutError')) {
                throw new Error('Parsing of XML in the page timed out. If you\'re expecting a large XML file, '
                    + 'such as a sitemap, try increasing the Page load timeout input setting.');
            } else {
                throw err;
            }
        }
    }
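
The regular expression above accepts exactly `text/xml` and `application/xml`, plus any type with a `+xml` structured-syntax suffix (e.g. `application/atom+xml`), while letting HTML and JSON pass through untouched. A quick check, runnable in plain Node:

const XML_TYPE_REGEX = /^(text|application)\/xml$|\+xml$/;

// Matches: the two bare XML types and any '+xml' suffix.
['text/xml', 'application/xml', 'application/atom+xml', 'image/svg+xml']
    .forEach(type => console.log(type, XML_TYPE_REGEX.test(type))); // all true

// Non-matches: anything else, including near-misses.
['text/html', 'application/json', 'application/xml-dtd']
    .forEach(type => console.log(type, XML_TYPE_REGEX.test(type))); // all false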

    async _injectBrowserHandles(page, pageContext) {
        const saveSnapshotP = browserTools.createBrowserHandle(page, () => browserTools.saveSnapshot({ page }));
        const skipLinksP = browserTools.createBrowserHandle(page, () => { pageContext.skipLinks = true; });
        const globalStoreP = browserTools.createBrowserHandlesForObject(
            page,
            this.globalStore,
            ['size', 'clear', 'delete', 'entries', 'get', 'has', 'keys', 'set', 'values'],
        );
        const logP = browserTools.createBrowserHandlesForObject(
            page,
            log,
            ['LEVELS', 'setLevel', 'getLevel', 'debug', 'info', 'warning', 'error', 'exception'],
        );
        const apifyP = browserTools.createBrowserHandlesForObject(page, Apify, ['getValue', 'setValue']);
        const requestQueueP = this.requestQueue
            ? browserTools.createBrowserHandlesForObject(page, this.requestQueue, ['addRequest'])
            : null;

        const [
            saveSnapshot,
            skipLinks,
            globalStore,
            logHandle,
            apify,
            requestQueue,
        ] = await Promise.all([saveSnapshotP, skipLinksP, globalStoreP, logP, apifyP, requestQueueP]);

        const handles = {
            saveSnapshot,
            skipLinks,
            globalStore,
            log: logHandle,
            apify,
        };
        if (requestQueue) handles.requestQueue = requestQueue;
        return handles;
    }
}

module.exports = CrawlerSetup;
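
The whitelisted handles above are what the browser-side bundle mounts onto the `pageFunction` context, letting user code call Node-side functionality transparently. A hedged sketch of how they surface in a typical `pageFunction` (assuming the standard Web Scraper context API):

async function pageFunction(context) {
    // Backed by browserTools.saveSnapshot({ page }) on the Node side.
    await context.saveSnapshot();

    // Backed by Apify.setValue() on the default key-value store.
    await context.setValue('LAST_URL', context.request.url);

    // Flips pageContext.skipLinks in Node, so _handleLinks() is skipped for this page.
    await context.skipLinks();

    return { url: context.request.url };
}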

src/global_store.js

/**
 * GlobalStore is a trivial storage that resembles a Map, to be used from browser contexts
 * to retain data through page navigations and browser instances. It limits Map's functionality
 * because it is currently impossible for functions and object references to cross the Node-Browser boundary.
 */
class GlobalStore extends Map {
    /* eslint-disable class-methods-use-this */
    get(key) {
        if (typeof key !== 'string') throw new Error('GlobalStore#get parameter "key" must be a string.');
        return super.get(key);
    }

    set(key, value) {
        if (typeof key !== 'string') throw new Error('GlobalStore#set parameter "key" must be a string.');
        return super.set(key, value);
    }

    forEach() {
        throw new Error('GlobalStore#forEach function is not available due to underlying technology limitations.');
    }

    values() {
        return Array.from(super.values());
    }

    keys() {
        return Array.from(super.keys());
    }

    entries() {
        return Array.from(super.entries());
    }

    get size() {
        return super.size;
    }
}

module.exports = GlobalStore;
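
Because values must be serializable to cross the Node-Browser boundary, `values()`, `keys()` and `entries()` return plain arrays instead of iterators, and keys are restricted to strings. A sketch of the intended use from a `pageFunction` (the handle-backed calls are asynchronous in the browser):

async function pageFunction(context) {
    const store = context.globalStore;

    // Keys must be strings; store.set(123, 'oops') would throw.
    const count = (await store.get('pageCount')) || 0;
    await store.set('pageCount', count + 1);

    // Plain arrays, not iterators, because iterators cannot cross the boundary.
    const keys = await store.keys(); // e.g. ['pageCount']

    return { keys };
}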

test/bundle.browser.js

const { expect } = require('chai');
const Apify = require('apify');
const createBundle = require('../src/bundle.browser');

const NAMESPACE = 'Apify';

describe('Bundle', () => {
    let browser;
    let page;
    before(async () => {
        browser = await Apify.launchPuppeteer({ headless: true });
    });
    after(async () => {
        await browser.close();
    });
    beforeEach(async () => {
        page = await browser.newPage();
        await page.evaluateOnNewDocument(createBundle, NAMESPACE);
    });
    afterEach(async () => {
        await page.close();
    });
    describe('Context', () => {
        const CONTEXT_OPTIONS = {
            crawlerSetup: {
                rawInput: '{}',
            },
            browserHandles: {
                apify: {},
                globalStore: {},
                log: {},
            },
            pageFunctionArguments: {
                request: {},
            },
        };
        beforeEach(async () => {
            await page.goto('about:chrome');
            await page.waitFor(namespace => !!window[namespace], {}, NAMESPACE);
            await page.evaluate((namespace, contextOptions) => {
                window.contextInstance = window[namespace].createContext(contextOptions);
            }, NAMESPACE, CONTEXT_OPTIONS);
        });
        describe('waitFor', () => {
            it('should work with a number', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    await ctx.waitFor(10);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a selector', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    const start = Date.now();
                    setTimeout(() => {
                        const el = document.createElement('div');
                        el.id = 'very-unique-id';
                        document.body.appendChild(el);
                    }, 10);
                    await ctx.waitFor('#very-unique-id');
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
            it('should work with a function', async () => {
                const millis = await page.evaluate(async () => {
                    const ctx = window.contextInstance;
                    let done = false;
                    const start = Date.now();
                    setTimeout(() => {
                        done = true;
                    }, 10);
                    await ctx.waitFor(() => done);
                    return Date.now() - start;
                });
                expect(millis).to.be.above(9);
            });
        });
    });
});