Dockerfile
1FROM apify/actor-node:16
2COPY package*.json ./
3RUN npm --quiet set progress=false \
4 && npm install --only=prod --no-optional \
5 && echo "Installed NPM packages:" \
6 && (npm list --only=prod --no-optional --all || true) \
7 && echo "Node.js version:" \
8 && node --version \
9 && echo "NPM version:" \
10 && npm --version
11COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "Schema for the jancurn/metadata-extractor actor",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "urls": {
7 "title": "Page URLs",
8 "type": "array",
9 "description": "URLs of the web pages to extract the metadata from. They must have either HTTP or HTTPS scheme.",
10 "editor": "stringList",
11 "prefill": ["https://www.apify.com/", "https://blog.apify.com"]
12 },
13 "proxy": {
14 "title": "Proxy configuration",
15 "type": "object",
16 "description": "Select proxies to be used by your crawler.",
17 "prefill": { "useApifyProxy": true },
18 "editor": "proxy"
19 }
20 },
21 "required": ["urls"]
22}
23
main.js
1const Apify = require('apify');
2
3const { log } = Apify.utils;
4
Apify.main(async () => {
    // getInput() resolves to null when the actor is started with no input;
    // default to {} so the destructuring below cannot throw.
    const input = (await Apify.getInput()) || {};
    const { urls = [], proxy = { useApifyProxy: false } } = input;

    // Backwards compatibility: older callers passed a single "url" field.
    if (input.url) urls.push(input.url);

    // Validate every URL up front so the run fails fast with a clear message.
    // NOTE: the original `if (!new URL(url))` check was dead code — the URL
    // constructor throws on invalid input, it never returns a falsy value.
    const requests = [];
    for (const url of urls) {
        let parsed;
        try {
            parsed = new URL(url);
        } catch (err) {
            throw new Error(`All URLs must be valid URLs! Invalid value: "${url}"`);
        }
        // INPUT_SCHEMA promises HTTP(S)-only URLs; enforce that promise here.
        if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
            throw new Error(`All URLs must have the HTTP or HTTPS scheme! Invalid value: "${url}"`);
        }
        requests.push({ url });
    }

    const requestList = await Apify.openRequestList('start-urls', requests);
    const proxyConfiguration = await Apify.createProxyConfiguration({ ...proxy });

    const crawler = new Apify.CheerioCrawler({
        requestList,
        proxyConfiguration,
        maxConcurrency: 50,
        // Extracts the page <title> and every <head> <meta> tag, keyed by its
        // name / property / http-equiv attribute (whichever is present first).
        handlePageFunction: async ({ $, request }) => {
            const meta = {};

            for (const tag of $('head meta')) {
                const name = $(tag).attr('name') || $(tag).attr('property') || $(tag).attr('http-equiv');
                const content = $(tag).attr('content');
                // Tags without a content attribute are recorded with null
                // so the key's presence is still visible in the results.
                if (name) meta[name] = content ? content.trim() : null;
            }

            await Apify.pushData({
                url: request.url,
                title: ($('head title').text() || '').trim(),
                meta,
            });
        },
    });

    log.info('Starting the crawl...');
    await crawler.run();
    log.info('Scraping finished! Metadata for each site is available in "Results".');
});
47
package.json
1{
2 "name": "extract-metadata",
3 "version": "0.0.2",
4 "description": "Metadata extractor.",
5 "dependencies": {
6 "apify": "^2.0.7"
7 },
8 "scripts": {
9 "start": "node main.js"
10 },
11 "author": "Jan Curn"
12}
Developer
Maintained by Community
Actor stats
- 982 users
- 620.6k runs
- Modified about 2 months ago
Categories