Example Puppeteer Promise Pool

  • mtrunkat/puppeteer-promise-pool-example
  • Modified
  • Users 16
  • Runs 591
  • Created by Marek Trunkát

An example of how to use Puppeteer in parallel using the 'es6-promise-pool' npm package.

Example Puppeteer Promise Pool

Dockerfile

# Template Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-puppeteer:beta

# Copy just package.json and package-lock.json first, since they are the only
# files that affect "npm install" in the next step; this lets Docker reuse the
# cached dependency layer when only the source code changes.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree plus tool versions for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the source code to the container as the last step, so builds stay fast
# when only the source code has changed.
COPY --chown=node:node . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment the line below for local Node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "latest",
        "es6-promise-pool": "latest"
    },
    "scripts": {
        "start": "node main.js"
    }
}

main.js

const Apify = require('apify');
const PromisePool = require('es6-promise-pool');

// How many urls we want to process in parallel.
const CONCURRENCY = 5;

// Urls to process. NOTE: promiseProducer consumes this array with pop(),
// so urls are crawled starting from the end of the list.
const URLS = [
    'http://example.com',
    'http://news.ycombinator.com',
    'https://news.ycombinator.com/news?p=2',
    'https://news.ycombinator.com/news?p=3',
    'https://news.ycombinator.com/news?p=4',
    'https://news.ycombinator.com/news?p=5',
    'https://www.reddit.com/',
];

// Shared browser instance, created inside Apify.main().
let browser;
// Accumulated results from all crawled pages. Never reassigned (only
// pushed to), so it can be a const.
const results = [];

// Returns a promise that resolves once Puppeteer opens the url, extracts
// the page title and final location, and closes the page.
// The page is closed in a `finally` block so that a failed navigation or
// evaluation cannot leak an open tab in the shared browser.
const crawlUrl = async (url) => {
    const page = await browser.newPage();

    try {
        console.log(`Opening ${url}`);
        await page.goto(url);

        console.log(`Evaluating ${url}`);
        const result = await page.evaluate(() => {
            return {
                title: document.title,
                // window.location.href may differ from `url` after redirects.
                url: window.location.href,
            };
        });

        results.push(result);
    } finally {
        console.log(`Closing ${url}`);
        await page.close();
    }
};

// Promise producer for the pool: each call pops one url off URLS and
// returns the crawlUrl(url) promise for it. Once URLS is exhausted it
// returns null, which tells the pool there is no more work.
const promiseProducer = () => {
    const nextUrl = URLS.pop();

    if (!nextUrl) {
        return null;
    }

    return crawlUrl(nextUrl);
};

Apify.main(async () => {
    // Start the shared browser used by all crawlUrl() calls.
    browser = await Apify.launchPuppeteer();

    try {
        // Run through all the urls in a pool of the given concurrency.
        const pool = new PromisePool(promiseProducer, CONCURRENCY);
        await pool.start();

        // Print results.
        console.log('Results:');
        console.log(JSON.stringify(results, null, 2));

        await Apify.setValue('OUTPUT', results);
    } finally {
        // Always close the browser — even when the pool or output storage
        // fails — so the actor process can exit cleanly.
        await browser.close();
    }
});