Algolia Webcrawler
Try for free
No credit card required
View all Actors
Algolia Webcrawler
jancurn/algolia-webcrawler
Try for free
No credit card required
Crawls a website using one or more sitemaps and imports the data into an Algolia search index. The text content is identified using simple CSS selectors.
Dockerfile
FROM apify/actor-node-basic

# First, copy package.json on its own, since it is what affects NPM install.
COPY package.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree plus the Node.js and NPM versions for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Lastly, copy the remaining files and directories with the source code.
# This way, a quick build does not need to reinstall packages after a simple change.
COPY . ./

# Specify how to run the source code. The exec (JSON array) form is used so
# that no intermediate `/bin/sh -c` wrapper is spawned around npm.
CMD ["npm", "start"]
main.js
1const fs = require('fs');
2const tmp = require('tmp');
3const Apify = require('apify');
4
// Workaround for the way algolia-webcrawler masks the error exit code
// (see https://github.com/DeuxHuitHuit/algolia-webcrawler/blob/master/app.js#L29):
// log the code the process is exiting with and call process.exit() with that
// same code again.
process.on('exit', function reportAndKeepExitCode(exitCode) {
    console.log(`Exiting the process with code ${exitCode}`);
    process.exit(exitCode);
});
11
/**
 * Actor entry point.
 *
 * Reads the actor INPUT, checks that it carries the minimum set of fields
 * tested below, writes it to a temporary JSON file and then loads
 * algolia-webcrawler with process.argv set up as if the command
 * `node algolia-webcrawler --config <file>` had been run.
 *
 * Exit codes set by this block:
 *   33 - the input is missing or incomplete,
 *   34 - an exception was thrown while preparing or loading the crawler.
 */
(async function () {
    try {
        // Get input of your actor
        const input = await Apify.getValue('INPUT');
        console.log('Input fetched:');
        // The input carries the `cred` field; print a copy with that field
        // masked so that credentials do not end up in the log.
        const loggableInput = input && input.cred
            ? Object.assign({}, input, { cred: '[REDACTED]' })
            : input;
        console.dir(loggableInput);

        // From algolia-webcrawler docs:
        // "At the bare minimum, you can edit config.json to set a values to the following options:
        //  'app', 'cred', 'indexname' and at least one 'sitemap' object. If you have multiple sitemaps,
        //  please list them all: sub-sitemaps will not be crawled."
        // An empty `sitemaps` array is truthy, so it is rejected explicitly to
        // honour the "at least one sitemap" requirement quoted above.
        const hasNoSitemaps = !input
            || !input.sitemaps
            || (Array.isArray(input.sitemaps) && input.sitemaps.length === 0);
        if (!input || !input.app || !input.cred || !input.index || hasNoSitemaps) {
            console.error('The input must be a JSON config file with fields as required by algolia-webcrawler package.');
            console.error('For details, see https://www.npmjs.com/package/algolia-webcrawler');
            process.exit(33);
        }

        // The crawler takes its configuration from a file, so dump the input to a temp file.
        const configFile = tmp.fileSync({ prefix: 'aloglia-input-', postfix: '.json' });
        console.log(`Writing input JSON to file ${configFile.name}`);
        fs.writeFileSync(configFile.name, JSON.stringify(input, null, 2));

        console.log(`Emulating command: node algolia-webcrawler --config ${configFile.name}`);
        process.argv[2] = '--config';
        process.argv[3] = configFile.name;
        // Loaded for its side effects only; the module's export is not used.
        // NOTE(review): presumably the package starts crawling as soon as it is
        // required and reads the arguments set above - confirm against its app.js.
        require('algolia-webcrawler');
    } catch (e) {
        console.error(e.stack || e);
        process.exit(34);
    }
})();
package.json
1{
2 "name": "my-actor",
3 "version": "0.0.1",
4 "dependencies": {
5 "apify": "^0.14.3",
6 "tmp": "^0.1.0",
7 "algolia-webcrawler": "^3.2.0"
8 },
9 "scripts": {
10 "start": "node main.js"
11 },
12 "author": "Me!"
13}