
Metadata Extractor
Pricing
Pay per usage
Go to Store

Metadata Extractor
A small, efficient actor that loads a web page, parses its HTML using the Cheerio library, and extracts metadata from the <HEAD> tag, such as the page title, description, and author.
0.0 (0)
Pricing
Pay per usage
13
Total users
1.3k
Monthly users
35
Runs succeeded
86%
Last modified
2 years ago
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-basic:v0.21.10

# Second, copy just package.json and package-lock.json since it should be
# the only file that affects "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{ "name": "apify-project", "version": "0.0.1", "description": "", "author": "It's not you it's me", "license": "ISC", "dependencies": { "apify": "0.21.10", "request-promise": "latest", "cheerio": "latest" }, "scripts": { "start": "node main.js" }}
main.js
const Apify = require('apify');
const request = require('request-promise');
const cheerio = require('cheerio');

/**
 * Actor entry point.
 *
 * Reads the record stored under the key 'INPUT', which must be an object with
 * a non-empty string `url` field, downloads that URL, parses the returned HTML
 * with Cheerio and collects:
 *   - `url`:   the input URL, unchanged,
 *   - `title`: trimmed text of `head title` ('' when there is none),
 *   - `meta`:  a map from each `<head> <meta name="...">` name to its trimmed
 *              `content` (null when the content is missing or empty).
 *
 * The result is printed to the console and saved under the key 'OUTPUT'.
 *
 * @throws {Error} When the input is missing or its `url` is not a non-empty string.
 */
Apify.main(async () => {
    // Get input of the act
    const input = await Apify.getValue('INPUT');
    // Reject an empty/blank URL here as well, so the failure is reported as an
    // input problem rather than as an error from the HTTP client.
    if (!input || typeof input.url !== 'string' || input.url.trim() === '') {
        throw new Error("Invalid input, it needs to contain 'url' field.");
    }

    // Load the web page and extract meta-data
    console.log(`Opening ${input.url}`);
    const html = await request(input.url);

    const $ = cheerio.load(html);

    const meta = {};
    // A regular function (not an arrow) is required: the callback reads the
    // current <meta> element through `this`.
    $('head meta').each(function () {
        const name = $(this).attr('name');
        const content = $(this).attr('content');
        // Tags without a `name` attribute are skipped; when the same name
        // occurs more than once, the last occurrence wins.
        if (name) meta[name] = content ? content.trim() : null;
    });

    const result = {
        url: input.url,
        title: ($('head title').text() || '').trim(),
        meta,
    };

    // Show and save result
    console.log('Result:');
    console.dir(result);
    await Apify.setValue('OUTPUT', result);
});