Metadata Extractor

No credit card required

Metadata Extractor

Metadata Extractor

jancurn/extract-metadata

No credit card required

A small, efficient actor that loads a web page, parses its HTML using the Cheerio library, and extracts metadata from the <HEAD> tag, such as the page title, description, and author.

Dockerfile

# Build on the Apify-provided Node.js 16 actor base image.
FROM apify/actor-node:16

# Copy only the package manifests first so the dependency-install layer below
# can be reused from the Docker cache when just the source code changes.
COPY package*.json ./

# Install production dependencies only and print the resulting package tree
# plus the Node.js/NPM versions to the build log for easier debugging.
# `--omit=dev --omit=optional` replaces the deprecated `--only=prod --no-optional`.
# `npm list` is allowed to fail (`|| true`) so that a non-zero exit from the
# listing step does not break the build.
RUN npm --quiet set progress=false \
 && npm install --omit=dev --omit=optional \
 && echo "Installed NPM packages:" \
 && (npm list --omit=dev --omit=optional --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the rest of the actor source code into the image.
COPY . ./

INPUT_SCHEMA.json

1{ 2 "title": "Schema for the jancurn/metadata-extractor actor", 3 "type": "object", 4 "schemaVersion": 1, 5 "properties": { 6 "urls": { 7 "title": "Page URLs", 8 "type": "array", 9 "description": "URLs of the web pages to extract the metadata from. They must have either HTTP or HTTPS scheme.", 10 "editor": "stringList", 11 "prefill": ["https://www.apify.com/", "https://blog.apify.com"] 12 }, 13 "proxy": { 14 "title": "Proxy configuration", 15 "type": "object", 16 "description": "Select proxies to be used by your crawler.", 17 "prefill": { "useApifyProxy": true }, 18 "editor": "proxy" 19 } 20 }, 21 "required": ["urls"] 22} 23

main.js

const Apify = require('apify');

const { log } = Apify.utils;

// URL schemes accepted as input; the input schema documents the same restriction.
const ALLOWED_PROTOCOLS = ['http:', 'https:'];

/**
 * Checks that `url` is an absolute HTTP(S) URL and wraps it in a request source object.
 *
 * @param {string} url - URL taken from the actor input.
 * @returns {{ url: string }} Request source to be added to the request list.
 * @throws {Error} If `url` cannot be parsed or does not use the HTTP or HTTPS scheme.
 */
const toRequest = (url) => {
    let parsed;
    try {
        parsed = new URL(url);
    } catch (err) {
        // `new URL()` throws on invalid input instead of returning a falsy value,
        // so the failure must be caught to report a readable error to the user.
        throw new Error(`All URLs must be valid URLs! Invalid URL: "${url}"`);
    }

    if (!ALLOWED_PROTOCOLS.includes(parsed.protocol)) {
        throw new Error(`All URLs must be valid URLs! Only HTTP and HTTPS are supported: "${url}"`);
    }

    return { url };
};

/**
 * Actor entry point: reads the input, crawls every given URL with CheerioCrawler
 * and pushes the page title and the <meta> tags found in <head> to the dataset.
 */
Apify.main(async () => {
    // `getInput()` may yield null when no input was provided; fall back to an
    // empty object so the destructuring below does not crash.
    const input = (await Apify.getInput()) || {};
    const { proxy = { useApifyProxy: false } } = input;

    // Work on a copy so the caller-owned input array is never mutated.
    const urls = [...(input.urls || [])];

    // Also accept a single `url` field in addition to the `urls` array.
    if (input.url) urls.push(input.url);

    if (urls.length === 0) {
        log.warning('No URLs were provided in the input, there is nothing to extract.');
    }

    // Fail fast on the first malformed URL, before any crawling starts.
    const requests = urls.map(toRequest);

    const requestList = await Apify.openRequestList('start-urls', requests);
    const proxyConfiguration = await Apify.createProxyConfiguration({ ...proxy });

    const crawler = new Apify.CheerioCrawler({
        requestList,
        proxyConfiguration,
        maxConcurrency: 50,
        handlePageFunction: async ({ $, request }) => {
            const meta = {};

            for (const tag of $('head meta').toArray()) {
                const $tag = $(tag);
                // A <meta> tag may be identified by any of these three attributes.
                const name = $tag.attr('name') || $tag.attr('property') || $tag.attr('http-equiv');
                const content = $tag.attr('content');
                // Tags without an identifying attribute (e.g. <meta charset>) are skipped;
                // a missing or empty `content` is stored as null.
                if (name) meta[name] = content ? content.trim() : null;
            }

            const result = {
                url: request.url,
                title: ($('head title').text() || '').trim(),
                meta,
            };

            return Apify.pushData(result);
        },
    });

    log.info('Starting the crawl...');
    await crawler.run();
    log.info('Scraping finished! Metadata for each site is available in "Results".');
});

package.json

1{ 2 "name": "extract-metadata", 3 "version": "0.0.2", 4 "description": "Metadata extractor.", 5 "dependencies": { 6 "apify": "^2.0.7" 7 }, 8 "scripts": { 9 "start": "node main.js" 10 }, 11 "author": "Jan Curn" 12}
Developer
Maintained by Community
Actor stats
  • 982 users
  • 620.6k runs
  • Modified about 2 months ago
Categories

You might also like these Actors