
Example Website Screenshot Crawler
Pricing
Pay per usage
Go to Store

Example Website Screenshot Crawler
Automated website screenshot crawler using Pyppeteer and Apify. This open-source actor captures screenshots from specified URLs, uploads them to the Apify Key-Value Store, and provides easy access to the results, making it ideal for monitoring website changes and archiving web content.
0.0 (0)
Pricing
Pay per usage
6
Total users
50
Monthly users
11
Runs succeeded
>99%
Last modified
8 months ago
.actor/Dockerfile
# Slim Python 3.10 base image.
# NOTE(review): despite the original comment, this is NOT the Apify-specific
# base image — browser dependencies are installed manually below.
FROM python:3.10-slim

# Set environment variables: no pip cache, fixed Playwright browser location.
ENV PIP_NO_CACHE_DIR=1
ENV PLAYWRIGHT_BROWSERS_PATH=/usr/local/share/playwright/

# Install system dependencies needed by headless Chromium, then trim apt
# caches to keep the image small.
RUN apt-get update && apt-get install -y \
    curl \
    unzip \
    wget \
    xvfb \
    libnss3 \
    libatk-bridge2.0-0 \
    libgtk-3-0 \
    libgbm-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (copied first so this layer caches well).
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install Playwright (optional if using Pyppeteer — the actor code itself
# uses Pyppeteer, which downloads its own Chromium on first launch).
RUN pip install playwright && playwright install --with-deps

# Create a directory for the actor's files
RUN mkdir -p /usr/src/app
WORKDIR /usr/src/app

# Copy all files from the current directory into the container
COPY . .

# Command to run the application
CMD ["python", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "website-screenshot-crawler", "title": "Automated Website Screenshot Crawler", "description": "This actor takes screenshots of specified websites, uploads them to Apify Key-Value Store, and provides URLs for easy access. It is useful for monitoring website changes, capturing visuals for reports, or web archiving.", "version": "1.0", "meta": { "templateId": "puppeteer_crawler" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "buildTag": "latest", "minMemoryMbytes": 4096, "readme": "./README.md", "storages": { "dataset": { "actorSpecification": 1, "title": "Results", "views": { "results": { "title": "results to scan", "transformation": { "fields": ["screenshot_url", "linkUrl"] }, "display": { "component": "table", "properties": { "linkUrl": { "label": "linkUrl", "format": "link" }, "screenshot_url": { "label": "screenshot_url", "format": "image" } } } } } } }}
.actor/input_schema.json
{ "$schema": "http://json-schema.org/draft-07/schema#", "title": "Input Schema", "type": "object", "schemaVersion": 1, "properties": { "fullPage": { "title": "Full Page Screenshot", "type": "boolean", "description": "Indicates whether the screenshot should capture the entire height of the page.", "default": false }, "link_urls": { "title": "Link URLs", "type": "array", "description": "A list of URLs to capture screenshots from.", "default": ["https://apify.com"], "editor": "stringList" }, "window_Width": { "title": "Browser Window Width", "type": "integer", "description": "The width of the browser window in pixels.", "default": 1920, "unit": "px" }, "window_Height": { "title": "Browser Window Height", "type": "integer", "description": "The height of the browser window in pixels.", "default": 1080, "unit": "px" }, "Sleep": { "type": "integer", "title": "Sleep Duration", "description": "The duration (in seconds) to wait after loading a page before taking a screenshot.", "default": 10, "editor": "number" }, "waitUntil": { "title": "Navigation Wait Condition", "type": "string", "description": "Specify when the navigation should be considered finished. 
Options are 'load' for when the load event is fired, or 'domcontentloaded' for when the DOM has been loaded.", "editor": "select", "default": "networkidle0", "enum": ["load", "domcontentloaded", "networkidle0", "networkidle2"], "enumTitles": [ "Load (all resources loaded)", "DOM Content Loaded (HTML parsed)", "Network Idle (no network connections)", "Network Idle (minimal network connections)" ] }, "cookies": { "sectionCaption": "Cookies", "sectionDescription": "You can use cookie editors such as [Cookie Editor](https://cookie-editor.com/) or [Copy Cookies](https://chromewebstore.google.com/detail/copy-cookies/jcbpglbplpblnagieibnemmkiamekcdg) to format cookies.", "title": "Cookies", "type": "array", "description": "Cookies to be used for the browsing session, formatted as JSON objects.", "editor": "json" }, "scrollToBottom": { "sectionCaption": "Scrolling Option", "title": "Enable Scrolling to Bottom", "type": "boolean", "description": "Determines whether the page should be scrolled to the bottom before taking a screenshot.", "default": false }, "distance": { "title": "Scrolling Distance", "type": "integer", "description": "The distance (in pixels) to scroll down the page during each scroll action. This controls how much content is revealed with each step.", "default": 100, "maximum": 1000, "unit": "px" }, "delay": { "title": "Scrolling Delay", "type": "integer", "description": "The delay (in milliseconds) to wait after each scroll action. This can be adjusted based on how quickly content loads after scrolling.", "default": 100, "maximum": 5000, "unit": "ms" } }, "required": ["link_urls"], "additionalProperties": true}
src/__main__.py
"""Module entrypoint for ``python -m src``: runs the actor's async main()."""
import asyncio

from .main import main

# Execute the Actor entrypoint.
asyncio.run(main())
src/main.py
1import os2import random3import string4import asyncio5import aiofiles6from apify import Actor7from pyppeteer import launch8
class MyActor:
    """Screenshot crawler: opens each input URL in headless Chromium
    (Pyppeteer), optionally scrolls to the bottom, takes a screenshot,
    stores it in the Apify Key-Value Store, and pushes the screenshot URL
    to the default dataset.
    """

    def __init__(self, actor_input):
        """Read the actor input dict, applying the defaults declared in
        .actor/input_schema.json.

        The original code used bare ``.get(...)`` with no fallback, so any
        missing key (e.g. when run outside the Apify platform, which fills
        schema defaults in) produced ``None`` and crashed later in
        ``asyncio.sleep`` / ``setViewport`` / the scroll JS.
        """
        self.link_urls = actor_input.get('link_urls', ['https://apify.com'])
        self.sleep_duration = actor_input.get('Sleep', 10)  # seconds to wait before the screenshot
        self.waitUntil = actor_input.get('waitUntil', 'networkidle0')  # Pyppeteer navigation condition
        self.cookies = actor_input.get('cookies') or []  # cookie dicts for page.setCookie; None -> []
        self.fullPage = actor_input.get('fullPage', False)  # capture the entire page height
        self.window_Width = actor_input.get('window_Width', 1920)  # viewport width (px)
        self.window_Height = actor_input.get('window_Height', 1080)  # viewport height (px)
        self.scrollToBottom = actor_input.get('scrollToBottom', False)
        # Scroll step distance (px) and per-step delay (ms). Coerced to int
        # because the values are interpolated into JavaScript source below —
        # a non-numeric value must fail here, not inject code into the page.
        self.distance = int(actor_input.get('distance', 100))
        self.delay = int(actor_input.get('delay', 100))

    async def configure_browser(self):
        """Launch headless Chromium with flags suited for containers:
        no sandbox, no /dev/shm usage, no GPU, image loading disabled."""
        browser = await launch({
            'headless': True,  # Set to False if you want to see the browser
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--blink-settings=imagesEnabled=false',  # Disable image loading
            ],
        })
        return browser

    async def scroll_to_bottom(self, page):
        """Scroll the page to the bottom in fixed steps so lazy-loaded
        content has a chance to render before the screenshot."""
        await page.evaluate(f"""
            async () => {{
                const distance = {self.distance}; // px per scroll step
                const delay = {self.delay};       // ms pause between steps
                while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) {{
                    document.scrollingElement.scrollBy(0, distance);
                    await new Promise(resolve => setTimeout(resolve, delay));
                }}
            }}
        """)

    async def process_link(self, page, link_url):
        """Process a single URL: open page, optionally scroll to bottom,
        take a screenshot, upload it to the Key-Value Store, and push its
        URL to the dataset. Errors are reported but never abort the crawl
        of the remaining URLs."""
        try:
            # Set the cookies before navigating to the page.
            if self.cookies:
                await page.setCookie(*self.cookies)

            # Set the viewport size to the configured width and height.
            await page.setViewport({'width': self.window_Width, 'height': self.window_Height})

            # Open the page and wait for the configured navigation condition.
            await page.goto(link_url, {'waitUntil': self.waitUntil})

            # Conditionally scroll to the bottom of the page.
            if self.scrollToBottom:
                await self.scroll_to_bottom(page)

            # Give the page time to settle (lazy content, animations).
            await asyncio.sleep(self.sleep_duration)

            # Random key avoids collisions between URLs and between runs.
            screenshot_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16)) + '.png'
            screenshot_path = f'Image_Files/{screenshot_name}'

            # Take the screenshot (full page if configured).
            await page.screenshot({'path': screenshot_path, 'fullPage': self.fullPage})

            # Upload the screenshot to Apify's Key-Value Store.
            async with aiofiles.open(screenshot_path, 'rb') as f:
                screenshot_content = await f.read()

            # Explicit content type so the record URL serves the file as an
            # image instead of application/octet-stream.
            store_id = Actor.config.default_key_value_store_id
            await Actor.set_value(screenshot_name, screenshot_content, content_type='image/png')

            screenshot_url = f'https://api.apify.com/v2/key-value-stores/{store_id}/records/{screenshot_name}'
            await Actor.push_data({'screenshot_url': screenshot_url, 'linkUrl': link_url})

            print(f'Screenshot for {link_url} saved.')

        except Exception as e:
            print(f"Error processing {link_url}: {str(e)}")

    async def run(self):
        """Main execution logic: launch the browser, process every input
        URL sequentially, and always shut the browser down."""
        os.makedirs('Image_Files', exist_ok=True)

        browser = await self.configure_browser()
        try:
            page = await browser.newPage()
            for link_url in self.link_urls:
                await self.process_link(page, link_url)
        finally:
            # Previously the browser process leaked if any await above
            # raised; always close it.
            await browser.close()
async def main():
    """Actor entry point: read the run input and drive the crawler."""
    async with Actor:
        run_input = await Actor.get_input() or {}
        crawler = MyActor(run_input)
        await crawler.run()


if __name__ == "__main__":
    asyncio.run(main())
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify==2.0.0
pyppeteer
aiofiles
requests
tenacity