Web Scraper Task Malu Practice 1 avatar
Web Scraper Task Malu Practice 1
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Web Scraper Task Malu Practice 1

Web Scraper Task Malu Practice 1

malu_alvarado/web-scraper-task-malu-test

A test that uses a Web Scraper task as the starting point for an Actor

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your Actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "My input schema",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "category": {
            "title": "Category",
            "type": "string",
            "description": "Economist.com category to be scraped",
            "editor": "textarea",
            "prefill": "briefing"
        }
    }
}

main.js

// This is the main Node.js source code file of your Actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the Actor. Input fields can be modified in the INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/platform/actors/development/actor-definition/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Here you can prepare your input for Actor apify/web-scraper. This input is
    // based on an Actor task you used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "closeCookieModals": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "excludes": [
            {
                "glob": "/**/*.{png,jpg,jpeg,pdf}"
            }
        ],
        "headless": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "linkSelector": "a",
        "pageFunction": "async function pageFunction(context) {\n    // request is an instance of Apify.Request (https://sdk.apify.com/docs/api/request)\n    // $ is an instance of jQuery (http://jquery.com/)\n    const request = context.request;\n    const $ = context.jQuery;\n    const pageNum = parseInt(request.url.split('?page=').pop());\n\n  context.log.info(`Scraping ${context.request.url}`);\n\n    // Extract all articles.\n    const articles = [];\n    $('article').each((index, articleEl) => {\n        const $articleEl = $(articleEl);\n\n        // H3 contains 2 child elements where first one is topic and second is article title.\n        const $h3El = $articleEl.find('h3');\n\n        // Extract additonal info and push it to data object.\n        articles.push({\n            pageNum,\n            topic: $h3El.children().first().text(),\n            title: $h3El.children().last().text(),\n            url: $articleEl.find('a')[0].href,\n            teaser: $articleEl.find('.teaser__text').text(),\n        });\n    });\n\n    // Return results.\n    return articles;\n}",
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept a single argument: the "crawlingContext" object.
            [
                async (crawlingContext) => {
                    // ...
                },
            ]`,
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept two arguments: the "crawlingContext" object
            // and "gotoOptions".
            [
                async (crawlingContext, gotoOptions) => {
                    // ...
                },
            ]`,
        "proxyConfiguration": {
            "useApifyProxy": true
        },
        // Start crawling from page 1 of the selected category.
        "startUrls": [
            {
                "url": `https://www.economist.com/${input.category}/?page=1`,
                "method": "GET"
            }
        ],
        // BUGFIX: the original object declared "globs" twice — the first entry
        // (a leftover crawlee.dev template glob) was silently discarded because the
        // later duplicate key overwrote it. The surviving entry used "purl", which
        // is the pseudo-URL pattern key and is not recognized inside "globs", so the
        // page-enumeration pattern never matched. It belongs in "pseudoUrls".
        "pseudoUrls": [
            {
                "purl": `https://www.economist.com/${input.category}/?page=[\\d+]`,
                "method": "GET"
            }
        ],
        "runMode": "DEVELOPMENT",

        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now let's metamorph into Actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.2.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}
Developer
Maintained by Community
Categories