# Base image provided by Apify; the diagnostics below expect it to supply
# Node.js, Google Chrome (via $APIFY_CHROME_EXECUTABLE_PATH) and ChromeDriver.
FROM apify/actor-node-chrome:beta

# Copy only the package manifests first so that the dependency layer below
# stays cached until they change. The glob also picks up package-lock.json
# when one exists, which makes the install reproducible.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
#
# NOTE: `npm list` exits non-zero on harmless problems (e.g. unmet peer
# dependencies), so its failure is ignored. It is wrapped in a subshell
# because `&&` and `||` have equal precedence in the shell: an unparenthesized
# `|| true` would also swallow a failed `npm install` and let the build
# continue with missing dependencies.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version \
 && echo "Google Chrome version:" \
 && bash -c "$APIFY_CHROME_EXECUTABLE_PATH --version" \
 && echo "ChromeDriver version:" \
 && chromedriver --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Input schema for Selenium example",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "url": {
            "title": "Url",
            "type": "string",
            "description": "URL to open with the Selenium WebDriver.",
            "editor": "textfield",
            "prefill": "https://www.example.com"
        },
        "userAgent": {
            "title": "User agent",
            "type": "string",
            "description": "Optional user agent string for the browser to use. If left empty, the actor's default user agent is used.",
            "editor": "textfield",
            "prefill": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        },
        "proxy": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used by your crawler.",
            "prefill": { "useApifyProxy": true },
            "editor": "proxy"
        }
    },
    "required": ["url"]
}

main.js

1const Apify = require('apify');
2const { Capabilities, Builder, logging } = require('selenium-webdriver');
3const chrome = require('selenium-webdriver/chrome');
4const proxy = require('selenium-webdriver/proxy');
5const { anonymizeProxy } = require('proxy-chain');
6
/**
 * Builds a Selenium WebDriver instance that drives Google Chrome.
 *
 * @param {Object} [options] Launch options; all fields are optional.
 * @param {boolean} [options.headless] When truthy, Chrome is started with
 *   `--headless` and `--no-sandbox`.
 * @param {string} [options.userAgent] Value passed to Chrome via `--user-agent`.
 * @param {string|string[]} [options.extraChromeArguments] Extra command-line
 *   arguments forwarded to `chromeOptions.addArguments()` as-is
 *   (NOTE(review): array form relies on addArguments accepting arrays - confirm
 *   against the installed selenium-webdriver version).
 * @param {string} [options.proxyUrl] Upstream proxy URL. It is passed through
 *   proxy-chain's `anonymizeProxy()` and the resulting URL is given to Chrome
 *   via `--proxy-server`.
 * @returns {Promise<Object>} Resolves to the WebDriver returned by `Builder.build()`.
 */
const launchChromeWebdriver = async (options = {}) => {
    // Uncomment to get verbose WebDriver HTTP logging while debugging:
    // logging.installConsoleHandler();
    // logging.getLogger('webdriver.http').setLevel(logging.Level.ALL);

    // See https://github.com/SeleniumHQ/selenium/wiki/DesiredCapabilities for reference.
    const capabilities = new Capabilities();
    capabilities.set('browserName', 'chrome');

    // Chrome-specific options
    // By default, Selenium already defines a long list of command-line options
    // to enable browser automation, here we add a few other ones
    // (inspired by Lighthouse, see lighthouse/lighthouse-cli/chrome-launcher)
    const chromeOptions = new chrome.Options();
    chromeOptions.addArguments('--disable-translate');
    chromeOptions.addArguments('--safebrowsing-disable-auto-update');

    if (options.headless) {
        chromeOptions.addArguments('--headless', '--no-sandbox');
    }

    if (options.userAgent) {
        chromeOptions.addArguments(`--user-agent=${options.userAgent}`);
    }

    if (options.extraChromeArguments) {
        chromeOptions.addArguments(options.extraChromeArguments);
    }

    // For proxy servers with authentication, anonymizeProxy() starts a local proxy server
    // and returns its URL, which is what Chrome is pointed at.
    // NOTE: to view effective proxy settings in Chrome, open chrome://net-internals/#proxy
    if (options.proxyUrl) {
        const anonymizedProxyUrl = await anonymizeProxy(options.proxyUrl);
        chromeOptions.addArguments(`--proxy-server=${anonymizedProxyUrl}`);
    }

    const webDriver = new Builder()
        .setChromeOptions(chromeOptions)
        .withCapabilities(capabilities)
        .build();

    return webDriver;
};
53
54
// Actor entry point: reads the input, opens the requested URL in headless Chrome
// through Selenium, and stores a screenshot, the page HTML and a small OUTPUT
// record (url + page title) in the default key-value store.
Apify.main(async () => {
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Fail early with a readable message instead of a TypeError further down.
    if (!input || !input.url) {
        throw new Error('Invalid input: the "url" field is required.');
    }

    // Prepare proxy URL
    let proxyUrl = '';
    if (input.proxy) {
        const { useApifyProxy, apifyProxyGroups, proxyUrls } = input.proxy;
        if (useApifyProxy) {
            proxyUrl = Apify.getApifyProxyUrl({ groups: apifyProxyGroups });
        } else if ((proxyUrls || []).length) {
            // Pick one of the custom proxy URLs at random. Math.random must be
            // *called*; multiplying the function itself yields NaN and therefore
            // an undefined proxy URL.
            proxyUrl = proxyUrls[Math.floor(Math.random() * proxyUrls.length)];
        }
    }

    // Prepare user agent: the input value wins over the built-in default.
    let userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36';
    if (input.userAgent) userAgent = input.userAgent;

    const options = {
        proxyUrl,
        userAgent,
        headless: true,
    };

    console.log('Launching Selenium WebDriver...');
    const webDriver = await launchChromeWebdriver(options);

    try {
        console.log(`Opening ${input.url}...`);
        await webDriver.get(input.url);

        const currentUrl = await webDriver.getCurrentUrl();
        console.log(`Current url ${currentUrl}`);

        console.log('Getting the page title...');
        // The function is executed inside the browser page by Selenium.
        const pageTitle = await webDriver.executeScript(() => {
            return document.title;
        });
        console.log(`Page title: ${pageTitle}`);

        console.log('Taking screenshot...');
        // takeScreenshot() result is decoded from base64 before it is stored as PNG.
        const screenshot = await webDriver.takeScreenshot();
        await Apify.setValue('screenshot.png', Buffer.from(screenshot, 'base64'), { contentType: 'image/png' });

        console.log('Getting the page html code...');
        const source = await webDriver.getPageSource();
        await Apify.setValue('source.html', source, { contentType: 'text/html' });

        console.log('Saving output...');
        await Apify.setValue('OUTPUT', {
            url: input.url,
            pageTitle,
        });

        console.log('Done.');
    } finally {
        // Always end the browser session, even when a step above fails.
        await webDriver.quit();
    }
});

package.json

{
    "name": "selenium-chrome-example",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^0.16.0",
        "proxy-chain": "^0.3.2",
        "selenium-webdriver": "^3.0.0"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Apify"
}

Selenium with wait

decorative_quorum/selenium-with-wait

Runs a simple Selenium-based scrape of a site, but waits a given amount of time for the browser to load the page.

José Eduardo Piña Castro

BeautifulSoup Scraper

apify/beautifulsoup-scraper

Crawls websites using raw HTTP requests. It parses the HTML with the BeautifulSoup library and extracts data from the pages using Python code. Supports both recursive crawling and lists of URLs. This Actor is a Python alternative to Cheerio Scraper.

Apify

885

4.2

Actor in Go example

jirimoravcik/go-actor-example

Example actor written in Go.

Jiří Moravčík

6thstreet Selenium Link Scraper

mumagdy/6thstreet-selenium-link-scraper

Muhammed Magdy

Example Public Actor Input

lukaskrivka/public-actor-input-example

Example of input schema for public scraping actors

Lukáš Křivka

Selenium Custom Firefox POC

jancurn/selenium-custom-firefox

Uses Selenium to run a custom build of Firefox that is harder to detect. The actor just saves a screenshot and a snapshot of the HTML.

Jan Čurn

Ruby Example

drobnikj/ruby-example

Example of Ruby code in an actor.

Jakub Drobník

Charlotte Tilbury Selenium Link Scraper

mumagdy/Charlotte-Tilbury-selenium-link-scraper

Muhammed Magdy

Linkedin Job Scraping

quaking_pail/linkedin-job-scraping

This Apify actor scrapes job listings from LinkedIn based on user-defined search parameters. It uses Selenium with a headless Chrome browser to navigate LinkedIn job pages and extract comprehensive job details.