Web Scraper Task

undrtkr984/web-scraper-task
Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Web Scraper Input",
    "type": "object",
    "description": "Web Scraper loads <b>Start URLs</b> in the Chrome browser and executes <b>Page function</b> on each page to extract data from it. To follow links and scrape additional pages, set <b>Link selector</b> with <b>Pseudo-URLs</b> and/or <b>Glob patterns</b> to specify which links to follow. Alternatively, you can manually enqueue new links in <b>Page function</b>. For details, see actor's <a href='https://apify.com/apify/web-scraper' target='_blank' rel='noopener'>README</a> or <a href='https://apify.com/docs/scraping/tutorial/introduction' target='_blank' rel='noopener'>Web scraping tutorial</a> in the Apify documentation.",
    "schemaVersion": 1,
    "properties": {
        "runMode": {
            "sectionCaption": "Basic configuration",
            "title": "Run mode",
            "type": "string",
            "description": "This property indicates the scraper's mode of operation. In DEVELOPMENT mode, the scraper ignores page timeouts, doesn't use sessionPool, opens pages one by one and enables debugging via Chrome DevTools. Open the live view tab or the container URL to access the debugger. Further debugging options can be configured in the Advanced configuration section. PRODUCTION mode disables debugging and enables timeouts and concurrency. <br><br>For details, see <a href='https://apify.com/apify/web-scraper#run-mode' target='_blank' rel='noopener'>Run mode</a> in README.",
            "default": "PRODUCTION",
            "prefill": "DEVELOPMENT",
            "editor": "select",
            "enum": [
                "PRODUCTION",
                "DEVELOPMENT"
            ]
        },
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "A static list of URLs to scrape. <br><br>For details, see <a href='https://apify.com/apify/web-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in README.",
            "prefill": [{ "url": "https://crawlee.dev" }],
            "editor": "requestListSources"
        },
        "keepUrlFragments": {
            "title": "URL #fragments identify unique pages",
            "type": "boolean",
            "description": "Indicates that URL fragments (e.g. <code>http://example.com<b>#fragment</b></code>) should be included when checking whether a URL has already been visited or not. Typically, URL fragments are used for page navigation only and therefore they should be ignored, as they don't identify separate pages. However, some single-page websites use URL fragments to display different pages; in such a case, this option should be enabled.",
            "default": false,
            "groupCaption": "Options"
        },
        "linkSelector": {
            "title": "Link selector",
            "type": "string",
            "description": "A CSS selector saying which links on the page (<code>&lt;a&gt;</code> elements with <code>href</code> attribute) shall be followed and added to the request queue. To filter the links added to the queue, use the <b>Pseudo-URLs</b> and/or <b>Glob patterns</b> setting.<br><br>If <b>Link selector</b> is empty, the page links are ignored.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#link-selector' target='_blank' rel='noopener'>Link selector</a> in README.",
            "editor": "textfield",
            "prefill": "a[href]"
        },
        "globs": {
            "title": "Glob Patterns",
            "type": "array",
            "description": "Glob patterns to match links in the page that you want to enqueue. Combine with Link selector to tell the scraper where to find links. Omitting the Glob patterns will cause the scraper to enqueue all links matched by the Link selector.",
            "editor": "globs",
            "default": [],
            "prefill": [{
                "glob": "https://crawlee.dev/*/*"
            }]
        },
        "pseudoUrls": {
            "title": "Pseudo-URLs",
            "type": "array",
            "description": "Specifies what kind of URLs found by <b>Link selector</b> should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in <code>[]</code> brackets, e.g. <code>http://www.example.com/[.*]</code>. <br><br>If <b>Pseudo-URLs</b> are omitted, the actor enqueues all links matched by the <b>Link selector</b>.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#pseudo-urls' target='_blank' rel='noopener'>Pseudo-URLs</a> in README.",
            "editor": "pseudoUrls",
            "default": [],
            "prefill": []
        },
        "pageFunction": {
            "title": "Page function",
            "type": "string",
            "description": "JavaScript (ES6) function that is executed in the context of every page loaded in the Chrome browser. Use it to scrape data from the page, perform actions or add new URLs to the request queue.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#page-function' target='_blank' rel='noopener'>Page function</a> in README.",
            "prefill": "// The function accepts a single argument: the \"context\" object.\n// For a complete list of its properties and functions,\n// see https://apify.com/apify/web-scraper#page-function \nasync function pageFunction(context) {\n    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!\n    // debugger; \n\n    // jQuery is handy for finding DOM elements and extracting data from them.\n    // To use it, make sure to enable the \"Inject jQuery\" option.\n    const $ = context.jQuery;\n    const pageTitle = $('title').first().text();\n    const h1 = $('h1').first().text();\n    const first_h2 = $('h2').first().text();\n    const random_text_from_the_page = $('p').first().text();\n\n\n    // Print some information to actor log\n    context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);\n\n    // Manually add a new page to the queue for scraping.\n    await context.enqueueRequest({ url: 'http://www.example.com' });\n\n    // Return an object with the data extracted from the page.\n    // It will be stored to the resulting dataset.\n    return {\n        url: context.request.url,\n        pageTitle,\n        h1,\n        first_h2,\n        random_text_from_the_page\n    };\n}",
            "editor": "javascript"
        },
        "injectJQuery": {
            "title": "Inject jQuery",
            "type": "boolean",
            "description": "If enabled, the scraper will inject the <a href='http://jquery.com' target='_blank' rel='noopener'>jQuery</a> library into every web page loaded, before <b>Page function</b> is invoked. Note that the jQuery object (<code>$</code>) will not be registered into global namespace in order to avoid conflicts with libraries used by the web page. It can only be accessed through <code>context.jQuery</code> in <b>Page function</b>.",
            "default": true
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and browser configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#proxy-configuration' target='_blank' rel='noopener'>Proxy configuration</a> in README.",
            "prefill": { "useApifyProxy": true },
            "default": { "useApifyProxy": true },
            "editor": "proxy"
        },
        "proxyRotation": {
            "title": "Proxy rotation",
            "type": "string",
            "description": "This property indicates the strategy of proxy rotation and can only be used in conjunction with Apify Proxy. The recommended setting automatically picks the best proxies from your available pool and rotates them evenly, discarding proxies that become blocked or unresponsive. If this strategy does not work for you for any reason, you may configure the scraper to either use a new proxy for each request, or to use one proxy as long as possible, until the proxy fails. IMPORTANT: This setting will only use your available Apify Proxy pool, so if you don't have enough proxies for a given task, no rotation setting will produce satisfactory results.",
            "default": "RECOMMENDED",
            "editor": "select",
            "enum": [
                "RECOMMENDED",
                "PER_REQUEST",
                "UNTIL_FAILURE"
            ],
            "enumTitles": [
                "Use recommended settings",
                "Rotate proxy after each request",
                "Use one proxy until failure"
            ]
        },
        "sessionPoolName": {
            "title": "Session pool name",
            "type": "string",
            "description": "<b>Use only English alphanumeric characters, dashes and underscores.</b> A session is a representation of a user. It has its own IP and cookies which are then used together to emulate a real user. Usage of the sessions is controlled by the Proxy rotation option. By providing a session pool name, you enable sharing of those sessions across multiple actor runs. This is very useful when you need specific cookies for accessing the websites or when a lot of your proxies are already blocked. Instead of trying randomly, a list of working sessions will be saved and a new actor run can reuse those sessions. Note that the IP lock on sessions expires after 24 hours, unless the session is used again in that window.",
            "editor": "textfield",
            "minLength": 3,
            "maxLength": 200,
            "pattern": "[0-9A-Za-z_-]"
        },
        "initialCookies": {
            "title": "Initial cookies",
            "type": "array",
            "description": "A JSON array with cookies that will be set to every Chrome browser tab opened before loading the page, in the format accepted by Puppeteer's <a href='https://pptr.dev/#?product=Puppeteer&show=api-pagesetcookiecookies' target='_blank' rel='noopener'><code>Page.setCookie()</code></a> function. This option is useful for transferring a logged-in session from an external web browser. For details on how to do this, read this <a href='https://help.apify.com/en/articles/1444249-log-in-to-website-by-transferring-cookies-from-web-browser-legacy' target='_blank' rel='noopener'>help article</a>.",
            "default": [],
            "prefill": [],
            "editor": "json"
        },
        "useChrome": {
            "title": "Use Chrome",
            "type": "boolean",
            "description": "If enabled, the scraper will use a real Chrome browser instead of Chromium bundled with Puppeteer. This option may help bypass certain anti-scraping protections, but might make the scraper unstable. Use at your own risk \uD83D\uDE42",
            "default": false,
            "groupCaption": "Browser masking"
        },
        "ignoreSslErrors": {
            "title": "Ignore SSL errors",
            "type": "boolean",
            "description": "If enabled, the scraper will ignore SSL/TLS certificate errors. Use at your own risk.",
            "default": false,
            "groupCaption": "Security"
        },
        "ignoreCorsAndCsp": {
            "title": "Ignore CORS and CSP",
            "type": "boolean",
            "description": "If enabled, the scraper will ignore Content Security Policy (CSP) and Cross-Origin Resource Sharing (CORS) settings of visited pages and requested domains. This enables you to freely use XHR/Fetch to make HTTP requests from <b>Page function</b>.",
            "default": false
        },
        "downloadMedia": {
            "sectionCaption": "Performance and limits",
            "title": "Download media files",
            "type": "boolean",
            "description": "If enabled, the scraper will download media such as images, fonts, videos and sound files, as usual. Disabling this option might speed up the scrape, but certain websites could stop working correctly.",
            "default": true,
            "groupCaption": "Page resources"
        },
        "downloadCss": {
            "title": "Download CSS files",
            "type": "boolean",
            "description": "If enabled, the scraper will download CSS files with stylesheets, as usual. Disabling this option may speed up the scrape, but certain websites could stop working correctly, and the live view will not look as cool.",
            "default": true
        },
        "maxRequestRetries": {
            "title": "Max page retries",
            "type": "integer",
            "description": "The maximum number of times the scraper will retry loading each web page, in case of a page load error or an exception thrown by <b>Page function</b>.<br><br>If set to <code>0</code>, the page will be considered failed right after the first error.",
            "minimum": 0,
            "default": 3
        },
        "maxPagesPerCrawl": {
            "title": "Max pages per run",
            "type": "integer",
            "description": "The maximum number of pages that the scraper will load. The scraper will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.<br><br>If set to <code>0</code>, there is no limit.",
            "minimum": 0,
            "default": 0
        },
        "maxResultsPerCrawl": {
            "title": "Max result records",
            "type": "integer",
            "description": "The maximum number of records that will be saved to the resulting dataset. The scraper will stop when this limit is reached. <br><br>If set to <code>0</code>, there is no limit.",
            "minimum": 0,
            "default": 0
        },
        "maxCrawlingDepth": {
            "title": "Max crawling depth",
            "type": "integer",
            "description": "Specifies how many links away from <b>Start URLs</b> the scraper will descend. This value is a safeguard against infinite crawling depths for misconfigured scrapers. Note that pages added using <code>context.enqueueRequest()</code> in <b>Page function</b> are not subject to the maximum depth constraint. <br><br>If set to <code>0</code>, there is no limit.",
            "minimum": 0,
            "default": 0
        },
        "maxConcurrency": {
            "title": "Max concurrency",
            "type": "integer",
            "description": "Specifies the maximum number of pages that can be processed by the scraper in parallel. The scraper automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target web server.",
            "minimum": 1,
            "default": 50
        },
        "pageLoadTimeoutSecs": {
            "title": "Page load timeout",
            "type": "integer",
            "description": "The maximum amount of time the scraper will wait for a web page to load, in seconds. If the web page does not load in this timeframe, it is considered to have failed and will be retried (subject to <b>Max page retries</b>), similarly as with other page load errors.",
            "minimum": 1,
            "default": 60,
            "unit": "seconds"
        },
        "pageFunctionTimeoutSecs": {
            "title": "Page function timeout",
            "type": "integer",
            "description": "The maximum amount of time the scraper will wait for <b>Page function</b> to execute, in seconds. It's a good idea to set this limit, to ensure that unexpected behavior in page function will not get the scraper stuck.",
            "minimum": 1,
            "default": 60,
            "unit": "seconds"
        },
        "waitUntil": {
            "title": "Navigation waits until",
            "type": "array",
            "description": "Contains a JSON array with names of page events to wait for, before considering a web page fully loaded. The scraper will wait until <b>all</b> of the events are triggered in the web page before executing <b>Page function</b>. Available events are <code>domcontentloaded</code>, <code>load</code>, <code>networkidle2</code> and <code>networkidle0</code>.<br><br>For details, see <a href='https://pptr.dev/#?product=Puppeteer&show=api-pagegotourl-options' target='_blank' rel='noopener'><code>waitUntil</code> option</a> in Puppeteer's <code>Page.goto()</code> function documentation.",
            "default": ["networkidle2"],
            "prefill": ["networkidle2"],
            "editor": "json"
        },
        "preNavigationHooks": {
            "sectionCaption": "Advanced configuration",
            "title": "Pre-navigation hooks",
            "type": "string",
            "description": "Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies or browser properties before navigation. Each function accepts two parameters, `crawlingContext` and `gotoOptions`, which are passed to the `page.goto()` function the crawler calls to navigate.",
            "prefill": "// We need to return array of (possibly async) functions here.\n// The functions accept two arguments: the \"crawlingContext\" object\n// and \"gotoOptions\".\n[\n    async (crawlingContext, gotoOptions) => {\n        // ...\n    },\n]\n",
            "editor": "javascript"
        },
        "postNavigationHooks": {
            "title": "Post-navigation hooks",
            "type": "string",
            "description": "Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful. Each function accepts `crawlingContext` as the only parameter.",
            "prefill": "// We need to return array of (possibly async) functions here.\n// The functions accept a single argument: the \"crawlingContext\" object.\n[\n    async (crawlingContext) => {\n        // ...\n    },\n]",
            "editor": "javascript"
        },
        "breakpointLocation": {
            "title": "Insert breakpoint",
            "type": "string",
            "description": "This property has no effect if Run mode is set to PRODUCTION. When set to DEVELOPMENT it inserts a breakpoint at the selected location in every page the scraper visits. Execution of code stops at the breakpoint until manually resumed in the DevTools window accessible via Live View tab or Container URL. Additional breakpoints can be added by adding <code>debugger;</code> statements within your Page function. <br><br>See <a href='https://apify.com/apify/web-scraper#run-mode' target='_blank' rel='noopener'>Run mode</a> in README for details.",
            "default": "NONE",
            "prefill": "NONE",
            "editor": "select",
            "enum": [
                "NONE",
                "BEFORE_GOTO",
                "BEFORE_PAGE_FUNCTION",
                "AFTER_PAGE_FUNCTION"
            ],
            "enumTitles": [
                "Nowhere. Break only on debugger; statements",
                "Before navigation to URL",
                "Before Page function invocation",
                "After Page function invocation"
            ]
        },
        "debugLog": {
            "title": "Debug log",
            "type": "boolean",
            "description": "If enabled, the actor log will include debug messages. Beware that this can be quite verbose. Use <code>context.log.debug('message')</code> to log your own debug messages from <b>Page function</b>.",
            "default": false,
            "groupCaption": "Logging"
        },
        "browserLog": {
            "title": "Browser log",
            "type": "boolean",
            "description": "If enabled, the actor log will include console messages produced by JavaScript executed by the web pages (e.g. using <code>console.log()</code>). Beware that this may result in the log being flooded by error messages, warnings and other messages of little value, especially with high concurrency.",
            "default": false
        },
        "customData": {
            "title": "Custom data",
            "type": "object",
            "description": "A custom JSON object that is passed to <b>Page function</b> as <code>context.customData</code>. This setting is useful when invoking the scraper via API, in order to pass some arbitrary parameters to your code.",
            "default": {},
            "prefill": {},
            "editor": "json"
        },
        "datasetName": {
            "title": "Dataset name",
            "type": "string",
            "description": "Name or ID of the dataset that will be used for storing results. If left empty, the default dataset of the run will be used.",
            "editor": "textfield"
        },
        "keyValueStoreName": {
            "title": "Key-value store name",
            "type": "string",
            "description": "Name or ID of the key-value store that will be used for storing records. If left empty, the default key-value store of the run will be used.",
            "editor": "textfield"
        },
        "requestQueueName": {
            "title": "Request queue name",
            "type": "string",
            "description": "Name of the request queue that will be used for storing requests. If left empty, the default request queue of the run will be used.",
            "editor": "textfield"
        }
    },
    "required": ["startUrls", "pageFunction"]
}
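
Only startUrls and pageFunction are required; every other field falls back to the defaults defined above. As a minimal sketch of how the fields fit together (the URL, glob and selectors below are illustrative placeholders taken from the prefills, not part of the task):

// Hypothetical example input satisfying this schema.
// Only "startUrls" and "pageFunction" are required.
const exampleInput = {
    runMode: "PRODUCTION",
    startUrls: [{ url: "https://crawlee.dev" }],
    linkSelector: "a[href]",
    globs: [{ glob: "https://crawlee.dev/*/*" }],
    injectJQuery: true,
    maxPagesPerCrawl: 10,
    proxyConfiguration: { useApifyProxy: true },
    pageFunction: `async function pageFunction(context) {
        const $ = context.jQuery;
        return {
            url: context.request.url,
            pageTitle: $('title').first().text(),
        };
    }`,
};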

main.js

// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the actor. Input fields can be modified in the INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actors/development/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Here you can prepare your input for the apify/web-scraper actor.
    // This input is based on the actor task you used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "maxRequestRetries": input.maxRequestRetries,
        "pageFunction": input.pageFunction,
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept a single argument: the "crawlingContext" object.
            [
                async (crawlingContext) => {
                    // ...
                },
            ]`,
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept two arguments: the "crawlingContext" object
            // and "gotoOptions".
            [
                async (crawlingContext, gotoOptions) => {
                    // ...
                },
            ]`,
        "proxyConfiguration": {
            "useApifyProxy": true,
            "apifyProxyCountry": "US"
        },
        "runMode": "PRODUCTION",
        "startUrls": input.startUrls,
        "useChrome": true,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now let's metamorph into the apify/web-scraper actor using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
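
Because the actor metamorphs into apify/web-scraper, the scraped records end up in the run's default dataset. Below is a minimal sketch of starting this actor from another script with the Apify API client; it assumes the apify-client package is installed and an APIFY_TOKEN environment variable is set, and the input mirrors the three fields main.js actually reads (startUrls, pageFunction, maxRequestRetries).

// Sketch only: run the actor remotely and fetch its dataset items.
// Assumes `npm install apify-client` and a valid APIFY_TOKEN.
const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

(async () => {
    // main.js forwards startUrls, pageFunction and maxRequestRetries
    // to apify/web-scraper via Apify.metamorph().
    const run = await client.actor('undrtkr984/web-scraper-task').call({
        startUrls: [{ url: 'https://crawlee.dev' }],
        pageFunction: `async function pageFunction(context) {
            return { url: context.request.url };
        }`,
        maxRequestRetries: 3,
    });

    // Results from the metamorphed run are stored in its default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.log(`Scraped ${items.length} pages`);
})();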

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.2.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}
Maintained by Community
Actor metrics
  • 7 monthly users
  • 99.5% runs succeeded
  • 0.0 days response time
  • Created in Jan 2023
  • Modified over 1 year ago