
Example Puppeteer Promise Pool
- mtrunkat/puppeteer-promise-pool-example
- Modified
- Users 16
- Runs 591
- Created by
Marek Trunkát
Example of how to use Puppeteer in parallel using the 'es6-promise-pool' npm package.
Dockerfile
# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-puppeteer:beta

# Copy just package.json and package-lock.json first: they are the only files
# that affect "npm install" in the next step, so Docker can reuse the cached
# install layer when only the source code changes.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, but print the dependency
# tree and tool versions for debugging.
# NOTE: "npm list" may exit non-zero on peer-dependency warnings, hence "|| true".
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version

# Copy the source code last, so rebuilds are fast when only sources changed
# (the npm-install layer above stays cached).
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "Example of running Puppeteer pages in parallel using es6-promise-pool",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "^0.22.4",
        "es6-promise-pool": "^2.5.0"
    },
    "scripts": {
        "start": "node main.js"
    }
}
main.js
const Apify = require('apify');
const PromisePool = require('es6-promise-pool');
// How many urls we want to process in parallel.
const CONCURRENCY = 5;
// Urls to process. NOTE: this array is consumed (popped) by promiseProducer,
// so urls are crawled in reverse order of this list.
const URLS = [
'http://example.com',
'http://news.ycombinator.com',
'https://news.ycombinator.com/news?p=2',
'https://news.ycombinator.com/news?p=3',
'https://news.ycombinator.com/news?p=4',
'https://news.ycombinator.com/news?p=5',
'https://www.reddit.com/',
];
// Shared browser instance, launched once in Apify.main() and used by all
// concurrent crawlUrl() calls.
let browser;
// Accumulator for per-page results; pushed to by crawlUrl().
let results = [];
// This function returns promise that gets resolved once Puppeteer
// opens url, evaluates content and closes it.
// Opens `url` in a new Puppeteer page, extracts the document title and the
// final URL (after any redirects), and appends the result to the shared
// `results` array.
//
// The page is closed in a `finally` block so that a failed navigation or
// evaluation does not leak an open page in the shared browser instance.
//
// @param {string} url - The URL to crawl.
// @returns {Promise<void>} Resolves once the page has been processed and closed.
const crawlUrl = async (url) => {
    const page = await browser.newPage();
    try {
        console.log(`Opening ${url}`);
        await page.goto(url);
        console.log(`Evaluating ${url}`);
        // Runs inside the browser context.
        const result = await page.evaluate(() => {
            return {
                title: document.title,
                url: window.location.href,
            };
        });
        results.push(result);
    } finally {
        console.log(`Closing ${url}`);
        await page.close();
    }
};
// Every time it's called takes one url from URLS constant and returns
// crawlUrl(url) promise. When URLS gets empty returns null.
// Producer for the promise pool: each invocation removes one url from URLS
// and returns the promise that crawls it. Once URLS is exhausted it returns
// null, which signals the pool to stop requesting more work.
const promiseProducer = () => {
    const nextUrl = URLS.pop();
    if (!nextUrl) return null;
    return crawlUrl(nextUrl);
};
Apify.main(async () => {
    // Launch a single shared browser instance used by every crawlUrl() call.
    browser = await Apify.launchPuppeteer();

    // Drain URLS with at most CONCURRENCY pages open at any moment.
    await new PromisePool(promiseProducer, CONCURRENCY).start();

    // Report the collected results and persist them to the key-value store.
    console.log('Results:');
    console.log(JSON.stringify(results, null, 2));
    await Apify.setValue('OUTPUT', results);

    await browser.close();
});