Web Scraper Task Malu Practice 1
Deprecated
Pricing
Pay per usage
Go to Store
Web Scraper Task Malu Practice 1
Deprecated
A test that uses a web scraper task as the starting point for calling an actor
0.0 (0)
Pricing
Pay per usage
1
Total users
1
Monthly users
1
Last modified
2 years ago
Dockerfile
# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional \
    && echo "Installed NPM packages:" \
    && (npm list || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your Actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
INPUT_SCHEMA.json
{
    "title": "My input schema",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "category": {
            "title": "Category",
            "type": "string",
            "description": "Economist.com category to be scraped",
            "editor": "textarea",
            "prefill": "briefing"
        }
    }
}
main.js
// This is the main Node.js source code file of your Actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the Actor. Input fields can be modified in INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/platform/actors/development/actor-definition/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Here you can prepare your input for Actor apify/web-scraper. This input is based
    // on the actor task you used as the starting point.
    // FIXME(review): the original literal declared "globs" twice — the first entry
    // (a crawlee.dev template leftover) was silently overwritten by the second, which
    // used pseudo-URL syntax ("purl", "[\d+]") that does not belong inside "globs".
    // The economist.com pattern has been moved to "pseudoUrls", where "purl" entries
    // are expected by apify/web-scraper.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "closeCookieModals": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        // Skip binary assets; only HTML pages are of interest.
        "excludes": [
            {
                "glob": "/**/*.{png,jpg,jpeg,pdf}"
            }
        ],
        "headless": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "linkSelector": "a",
        "pageFunction": "async function pageFunction(context) {\n // request is an instance of Apify.Request (https://sdk.apify.com/docs/api/request)\n // $ is an instance of jQuery (http://jquery.com/)\n const request = context.request;\n const $ = context.jQuery;\n const pageNum = parseInt(request.url.split('?page=').pop());\n\n context.log.info(`Scraping ${context.request.url}`);\n\n // Extract all articles.\n const articles = [];\n $('article').each((index, articleEl) => {\n const $articleEl = $(articleEl);\n\n // H3 contains 2 child elements where first one is topic and second is article title.\n const $h3El = $articleEl.find('h3');\n\n // Extract additonal info and push it to data object.\n articles.push({\n pageNum,\n topic: $h3El.children().first().text(),\n title: $h3El.children().last().text(),\n url: $articleEl.find('a')[0].href,\n teaser: $articleEl.find('.teaser__text').text(),\n });\n });\n\n // Return results.\n return articles;\n}",
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept a single argument: the "crawlingContext" object.
        [
            async (crawlingContext) => {
                // ...
            },
        ]`,
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept two arguments: the "crawlingContext" object
        // and "gotoOptions".
        [
            async (crawlingContext, gotoOptions) => {
                // ...
            },
        ]`,
        "proxyConfiguration": {
            "useApifyProxy": true
        },
        // Start on page 1 of the requested economist.com category.
        "startUrls": [
            {
                "url": `https://www.economist.com/${input.category}/?page=1`,
                "method": "GET"
            }
        ],
        // Follow pagination links matching this pseudo-URL ([\d+] = one or more digits).
        "pseudoUrls": [
            {
                "purl": `https://www.economist.com/${input.category}/?page=[\\d+]`,
                "method": "GET"
            }
        ],
        "runMode": "DEVELOPMENT",
        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now let's metamorph into Actor apify/web-scraper using the created input.
    // metamorph() replaces this Actor's run with apify/web-scraper, handing it
    // metamorphInput; no code after this call is executed.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
package.json
{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.2.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}