Metadata Extractor

jancurn/extract-metadata
A small, efficient actor that loads web pages, parses their HTML using the Cheerio library, and extracts metadata from the <head> tag, such as the page title, description, and author.
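
If you prefer to start the actor programmatically rather than from the Apify Console, a minimal sketch using the apify-client NPM package could look like the one below. The API token and URLs are placeholders, and the input object follows INPUT_SCHEMA.json shown further down.

// Hypothetical usage sketch, not part of the actor's source code.
const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: 'YOUR_APIFY_TOKEN' });

(async () => {
    // Input fields correspond to INPUT_SCHEMA.json: a list of URLs plus an optional proxy config.
    const run = await client.actor('jancurn/extract-metadata').call({
        urls: ['https://www.apify.com/'],
        proxy: { useApifyProxy: true },
    });

    // The extracted metadata ends up in the run's default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.dir(items, { depth: null });
})();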

Dockerfile

FROM apify/actor-node:16
COPY package*.json ./
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --only=prod --no-optional --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version
COPY . ./

INPUT_SCHEMA.json

{
    "title": "Schema for the jancurn/metadata-extractor actor",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "urls": {
            "title": "Page URLs",
            "type": "array",
            "description": "URLs of the web pages to extract the metadata from. They must have either HTTP or HTTPS scheme.",
            "editor": "stringList",
            "prefill": ["https://www.apify.com/", "https://blog.apify.com"]
        },
        "proxy": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used by your crawler.",
            "prefill": { "useApifyProxy": true },
            "editor": "proxy"
        }
    },
    "required": ["urls"]
}
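
For reference, a run input that satisfies this schema could look as follows (the URLs are the schema's prefill examples; "proxy" is optional, while "urls" is required):

{
    "urls": ["https://www.apify.com/", "https://blog.apify.com"],
    "proxy": { "useApifyProxy": true }
}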

main.js

const Apify = require('apify');

const { log } = Apify.utils;

Apify.main(async () => {
    const input = await Apify.getInput();
    const { urls = [], proxy = { useApifyProxy: false } } = input;

    // Also accept a single "url" input field, if provided.
    if (input.url) urls.push(input.url);

    const requests = [];
    for (const url of urls) {
        // new URL() throws on an invalid URL, so wrap it to report a clear error.
        try {
            new URL(url);
        } catch (err) {
            throw new Error(`All URLs must be valid URLs! Invalid value: ${url}`);
        }
        requests.push({ url });
    }

    const requestList = await Apify.openRequestList('start-urls', requests);
    const proxyConfiguration = await Apify.createProxyConfiguration({ ...proxy });

    const crawler = new Apify.CheerioCrawler({
        requestList,
        proxyConfiguration,
        maxConcurrency: 50,
        handlePageFunction: async ({ $, request }) => {
            // Collect all <meta> tags from <head>, keyed by name, property, or http-equiv.
            const meta = {};

            for (const tag of $('head meta')) {
                const name = $(tag).attr('name') || $(tag).attr('property') || $(tag).attr('http-equiv');
                const content = $(tag).attr('content');
                if (name) meta[name] = content ? content.trim() : null;
            }

            const result = {
                url: request.url,
                title: ($('head title').text() || '').trim(),
                meta,
            };

            return Apify.pushData(result);
        },
    });

    log.info('Starting the crawl...');
    await crawler.run();
    log.info('Scraping finished! Metadata for each site is available in "Results".');
});
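
Each item pushed to the default dataset by handlePageFunction has roughly the following shape (the values here are illustrative, not real output):

{
    "url": "https://www.apify.com/",
    "title": "Example page title",
    "meta": {
        "description": "Example content of <meta name=\"description\">",
        "og:title": "Example Open Graph title"
    }
}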

package.json

{
    "name": "extract-metadata",
    "version": "0.0.2",
    "description": "Metadata extractor.",
    "dependencies": {
        "apify": "^2.0.7"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Jan Curn"
}
Maintained by Community

Actor metrics
  • 35 monthly users
  • 39.0% runs succeeded
  • 0.0 days response time
  • Created in Feb 2018
  • Modified 7 months ago