Playwright + Chrome

A crawler example that uses headless Chrome driven by Playwright to scrape a website. Headless browsers render JavaScript and can help you avoid getting blocked.
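To see the core idea in isolation, here is a minimal sketch (independent of the Apify SDK) that launches headless Chromium via Playwright and reads a page's title after its JavaScript has run; the URL is just a placeholder:

import asyncio

from playwright.async_api import async_playwright


async def fetch_title(url: str) -> str:
    # Launch headless Chromium, navigate to the URL, and read the title
    # after the page's JavaScript has executed.
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        title = await page.title()
        await browser.close()
        return title


print(asyncio.run(fetch_title('https://apify.com')))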

Language

python

Tools

playwright

Use cases

Web scraping

from urllib.parse import urljoin

from apify import Actor
from playwright.async_api import async_playwright

# To run this Actor locally, you need to have the Playwright browsers installed.
# Run `playwright install --with-deps` in the Actor's virtual environment to install them.
# When running on the Apify platform, they are already included in the Actor's Docker image.


async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Enqueue the starting URLs in the default request queue
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Launch Playwright and open a new browser context
        Actor.log.info('Launching Playwright...')
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=Actor.config.headless)
            context = await browser.new_context()

            # Process the requests in the queue one by one
            while request := await default_queue.fetch_next_request():
                url = request['url']
                depth = request['userData']['depth']
                Actor.log.info(f'Scraping {url} ...')

                # Open the URL in a new Playwright page
                page = await context.new_page()
                try:
                    await page.goto(url)

                    # If we haven't reached the max depth,
                    # look for nested links and enqueue their targets
                    if depth < max_depth:
                        for link in await page.locator('a').all():
                            link_href = await link.get_attribute('href')
                            # Skip anchors without an href attribute
                            if not link_href:
                                continue
                            link_url = urljoin(url, link_href)
                            if link_url.startswith(('http://', 'https://')):
                                Actor.log.info(f'Enqueuing {link_url} ...')
                                await default_queue.add_request({
                                    'url': link_url,
                                    'userData': {'depth': depth + 1},
                                })

                    # Push the title of the page into the default dataset
                    title = await page.title()
                    await Actor.push_data({'url': url, 'title': title})
                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')
                finally:
                    await page.close()
                    await default_queue.mark_request_as_handled(request)
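
For local testing, the generated project typically wires this main coroutine to an entry point along these lines (a sketch; the exact module layout of the template may differ):

import asyncio

from .main import main

# The Actor reads its input from the default key-value store; when running
# locally, the Apify CLI supplies it, e.g.:
# {"start_urls": [{"url": "https://apify.com"}], "max_depth": 1}
asyncio.run(main())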

Playwright template

Included features

  • Apify SDK for Python - toolkit for building Apify Actors and scrapers in Python
  • Input schema - define and easily validate a schema for your Actor's input
  • Dataset - store structured data where each object stored has the same attributes (see the read-back sketch after this list)
  • Playwright - a browser automation library
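
For example, once a run finishes, the stored {url, title} items can be read back from the default dataset. A minimal sketch, assuming the Apify SDK's Dataset.iterate_items API:

from apify import Actor


async def print_results() -> None:
    # Open the default dataset and log each stored item.
    async with Actor:
        dataset = await Actor.open_dataset()
        async for item in dataset.iterate_items():
            Actor.log.info(f'{item["url"]}: {item["title"]}')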

Resources

A short guide on how to build web scrapers using code templates: web scraper template

Already have a solution in mind?

Sign up for a free Apify account and deploy your code to the platform in just a few minutes! If you want a head start without coding it yourself, browse our Store of existing solutions.