Web Scraper Task Malu Practice 1

Deprecated

Developed by Malu Alvarado

Maintained by Community

A test Actor built from a Web Scraper task; it metamorphs into apify/web-scraper with input derived from that task.

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 1
Monthly users: 1
Last modified: 2 years ago

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16
# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./
# Optionally, specify how to launch the source code of your Actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "My input schema",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "category": {
            "title": "Category",
            "type": "string",
            "description": "Economist.com category to be scraped",
            "editor": "textarea",
            "prefill": "briefing"
        }
    }
}
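At runtime, the Actor reads this input with Apify.getInput() (see main.js below). Here is a minimal sketch of what the schema yields, assuming the Apify SDK v2 pinned in package.json; note that "prefill" only pre-populates the input editor in the Apify Console and is not applied as a runtime default, so a fallback in code is still useful.

// Minimal sketch: reading the input defined by INPUT_SCHEMA.json (Apify SDK v2).
const Apify = require('apify');

Apify.main(async () => {
    // With the schema above, the input object looks like: { category: 'briefing' }.
    // "prefill" is editor-only, so we provide a runtime fallback as well.
    const { category = 'briefing' } = (await Apify.getInput()) ?? {};
    console.log(`Will scrape https://www.economist.com/${category}/?page=1`);
});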

main.js

// This is the main Node.js source code file of your Actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get the input of the Actor. Input fields can be modified in the INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/platform/actors/development/actor-definition/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Here you can prepare your input for the Actor apify/web-scraper. This input is
    // based on the Actor task you used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "closeCookieModals": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "excludes": [
            {
                "glob": "/**/*.{png,jpg,jpeg,pdf}"
            }
        ],
        // Leftover from the Web Scraper template; it matches nothing on economist.com.
        "globs": [
            {
                "glob": "https://crawlee.dev/*/*"
            }
        ],
        "headless": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "linkSelector": "a",
        "pageFunction": "async function pageFunction(context) {\n    // request is an instance of Apify.Request (https://sdk.apify.com/docs/api/request)\n    // $ is an instance of jQuery (http://jquery.com/)\n    const request = context.request;\n    const $ = context.jQuery;\n    const pageNum = parseInt(request.url.split('?page=').pop());\n\n    context.log.info(`Scraping ${context.request.url}`);\n\n    // Extract all articles.\n    const articles = [];\n    $('article').each((index, articleEl) => {\n        const $articleEl = $(articleEl);\n\n        // H3 contains 2 child elements: the first is the topic, the second the article title.\n        const $h3El = $articleEl.find('h3');\n\n        // Extract additional info and push it to the data object.\n        articles.push({\n            pageNum,\n            topic: $h3El.children().first().text(),\n            title: $h3El.children().last().text(),\n            url: $articleEl.find('a')[0].href,\n            teaser: $articleEl.find('.teaser__text').text(),\n        });\n    });\n\n    // Return results.\n    return articles;\n}",
        "postNavigationHooks": `// We need to return an array of (possibly async) functions here.
// The functions accept a single argument: the "crawlingContext" object.
[
    async (crawlingContext) => {
        // ...
    },
]`,
        "preNavigationHooks": `// We need to return an array of (possibly async) functions here.
// The functions accept two arguments: the "crawlingContext" object
// and "gotoOptions".
[
    async (crawlingContext, gotoOptions) => {
        // ...
    },
]`,
        "proxyConfiguration": {
            "useApifyProxy": true
        },
        "startUrls": [
            {
                "url": `https://www.economist.com/${input.category}/?page=1`,
                "method": "GET"
            }
        ],
        // This was originally a second, duplicate "globs" key, which silently overrode
        // the one above. The "purl" syntax belongs to "pseudoUrls", which is the field
        // apify/web-scraper expects for this pattern.
        "pseudoUrls": [
            {
                "purl": `https://www.economist.com/${input.category}/?page=[\\d+]`,
                "method": "GET"
            }
        ],
        "runMode": "DEVELOPMENT",
        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now let's metamorph into the Actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
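The pageFunction above is stored as an escaped one-line string, which is hard to read. Here is the same function unescaped; it runs inside apify/web-scraper on every loaded page, with context.jQuery available because injectJQuery is set to true.

async function pageFunction(context) {
    // request is an instance of Apify.Request (https://sdk.apify.com/docs/api/request)
    // $ is an instance of jQuery (http://jquery.com/)
    const request = context.request;
    const $ = context.jQuery;
    const pageNum = parseInt(request.url.split('?page=').pop());

    context.log.info(`Scraping ${context.request.url}`);

    // Extract all articles.
    const articles = [];
    $('article').each((index, articleEl) => {
        const $articleEl = $(articleEl);

        // H3 contains 2 child elements: the first is the topic, the second the article title.
        const $h3El = $articleEl.find('h3');

        // Extract additional info and push it to the data object.
        articles.push({
            pageNum,
            topic: $h3El.children().first().text(),
            title: $h3El.children().last().text(),
            url: $articleEl.find('a')[0].href,
            teaser: $articleEl.find('.teaser__text').text(),
        });
    });

    // Return results.
    return articles;
}

Apify.metamorph() then replaces this Actor's Docker image with that of apify/web-scraper while keeping the same run and its default storages, so the articles returned by the pageFunction end up in this run's dataset.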

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.2.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}