Algolia Webcrawler avatar
Algolia Webcrawler
Try for free

No credit card required

View all Actors
Algolia Webcrawler

Algolia Webcrawler

jancurn/algolia-webcrawler
Try for free

No credit card required

Crawls a website using one or more sitemaps and imports the data into an Algolia search index. The text content is identified using simple CSS selectors.

Dockerfile

1FROM apify/actor-node-basic
2
3# First, copy package.json since it affects NPM install
4COPY package.json ./
5
6# Install NPM packages, skip optional and development dependencies to
7# keep the image small. Avoid logging too much and print the dependency
8# tree for debugging
9RUN npm --quiet set progress=false \
10 && npm install --only=prod --no-optional \
11 && echo "Installed NPM packages:" \
12 && npm list \
13 && echo "Node.js version:" \
14 && node --version \
15 && echo "NPM version:" \
16 && npm --version
17
18# Lastly, copy remaining files and directories with the source code.
19# This way, quick build will not need to reinstall packages on a simple change.
20COPY . ./
21
22# Specify how to run the source code
23CMD npm start

main.js

const fs = require('fs');
const tmp = require('tmp');
const Apify = require('apify');

// Exit codes used when the actor fails before the crawler takes over.
const EXIT_CODE_INVALID_INPUT = 33;
const EXIT_CODE_UNEXPECTED_ERROR = 34;

// Top-level config fields that must be present (truthy) in the actor input.
const REQUIRED_INPUT_FIELDS = ['app', 'cred', 'index', 'sitemaps'];

// Hack to circumvent strange error exit code masking in algolia-webcrawler
// (see https://github.com/DeuxHuitHuit/algolia-webcrawler/blob/master/app.js#L29).
// This listener is registered before the crawler module is loaded, so it runs
// ahead of any 'exit' listener registered later and re-issues process.exit()
// with the code the process was already exiting with.
process.on('exit', (code) => {
    console.log(`Exiting the process with code ${code}`);
    process.exit(code);
});

/**
 * Tells whether the actor input carries every field this actor requires.
 *
 * @param {*} input - Value loaded from the actor's INPUT record.
 * @returns {boolean} True when `input` is truthy and each required field is truthy.
 */
function hasRequiredFields(input) {
    return Boolean(input) && REQUIRED_INPUT_FIELDS.every((field) => Boolean(input[field]));
}

/**
 * Returns a copy of the input that is safe to print to the run log.
 * The `cred` field is masked so that credentials are not written to the log.
 * NOTE(review): assumes `cred` is the only secret-bearing field - confirm
 * against the algolia-webcrawler config format.
 *
 * @param {*} input - Value loaded from the actor's INPUT record.
 * @returns {*} A shallow copy with `cred` masked, or `input` itself when it is
 *     not a plain (non-array) object or has no `cred` field.
 */
function redactCredentials(input) {
    const isPlainRecord = input !== null && typeof input === 'object' && !Array.isArray(input);
    if (!isPlainRecord || !('cred' in input)) {
        return input;
    }
    return Object.assign({}, input, { cred: '[REDACTED]' });
}

/**
 * Loads the actor input, validates it, writes it to a temporary JSON file and
 * then loads algolia-webcrawler as if it had been started from the command
 * line with `--config <file>`.
 *
 * Exits the process with EXIT_CODE_INVALID_INPUT when required fields are missing.
 *
 * @returns {Promise<void>}
 */
async function main() {
    // Get input of your actor
    const input = await Apify.getValue('INPUT');
    console.log('Input fetched:');
    console.dir(redactCredentials(input));

    // From algolia-webcrawler docs:
    // "At the bare minimum, you can edit config.json to set a values to the following options:
    //  'app', 'cred', 'indexname' and at least one 'sitemap' object. If you have multiple sitemaps,
    //  please list them all: sub-sitemaps will not be crawled."
    if (!hasRequiredFields(input)) {
        console.error('The input must be a JSON config file with fields as required by algolia-webcrawler package.');
        console.error('For details, see https://www.npmjs.com/package/algolia-webcrawler');
        process.exit(EXIT_CODE_INVALID_INPUT);
    }

    // The crawler reads its configuration from a file, so persist the
    // (unredacted) input to a temporary JSON file first.
    const configFile = tmp.fileSync({ prefix: 'aloglia-input-', postfix: '.json' });
    console.log(`Writing input JSON to file ${configFile.name}`);
    fs.writeFileSync(configFile.name, JSON.stringify(input, null, 2));

    console.log(`Emulating command: node algolia-webcrawler --config ${configFile.name}`);
    process.argv[2] = '--config';
    process.argv[3] = configFile.name;

    // Loaded purely for its side effect: per the emulated command above, the
    // package is expected to start crawling when required, so the returned
    // value is not needed.
    require('algolia-webcrawler');
}

main().catch((err) => {
    console.error(err.stack || err);
    process.exit(EXIT_CODE_UNEXPECTED_ERROR);
});

package.json

1{
2    "name": "my-actor",
3    "version": "0.0.1",
4    "dependencies": {
5        "apify": "^0.14.3",
6        "tmp": "^0.1.0",
7        "algolia-webcrawler": "^3.2.0"
8    },
9    "scripts": {
10        "start": "node main.js"
11    },
12    "author": "Me!"
13}
Developer
Maintained by Community
Actor metrics
  • 3 monthly users
  • 0.0% runs succeeded
  • 41.4 days response time
  • Created in Oct 2018
  • Modified about 3 years ago
Categories