
Example Website Screenshot Crawler
dz_omar/example-website-screenshot-crawler
Automated website screenshot crawler using Pyppeteer and Apify. This open-source actor captures screenshots from specified URLs, uploads them to the Apify Key-Value Store, and provides easy access to the results, making it ideal for monitoring website changes and archiving web content.
0.0 (0)
Pricing
Pay per usage
5
Monthly users
10
Runs succeeded
>99%
Last modified
5 months ago
.actor/Dockerfile
# Use the official Python slim base image
FROM python:3.10-slim

# Set environment variables
ENV PIP_NO_CACHE_DIR=1
ENV PLAYWRIGHT_BROWSERS_PATH=/usr/local/share/playwright/

# Install system dependencies
RUN apt-get update && apt-get install -y \
    curl \
    unzip \
    wget \
    xvfb \
    libnss3 \
    libatk-bridge2.0-0 \
    libgtk-3-0 \
    libgbm-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install Playwright (optional; the actor itself uses Pyppeteer)
RUN pip install playwright && playwright install --with-deps

# Create a directory for the actor's files
RUN mkdir -p /usr/src/app
WORKDIR /usr/src/app

# Copy all files from the current directory into the container
COPY . .

# Command to run the application
CMD ["python", "-m", "src"]
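
Note that the image installs Playwright's browsers, while the code below drives Pyppeteer, which downloads its own Chromium build the first time it launches. One way to avoid that download at run time is a small build-time helper that pre-fetches Pyppeteer's Chromium; the sketch below is not part of this actor, and the file name prefetch_chromium.py (which would be invoked from the Dockerfile, e.g. via RUN python prefetch_chromium.py) is hypothetical.

# prefetch_chromium.py - hypothetical build-time helper, not part of this actor
from pyppeteer import chromium_downloader

# Download Pyppeteer's bundled Chromium revision if it is not already present,
# so the first container run does not have to fetch it.
if not chromium_downloader.check_chromium():
    chromium_downloader.download_chromium()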
.actor/actor.json
{
  "actorSpecification": 1,
  "name": "website-screenshot-crawler",
  "title": "Automated Website Screenshot Crawler",
  "description": "This actor takes screenshots of specified websites, uploads them to Apify Key-Value Store, and provides URLs for easy access. It is useful for monitoring website changes, capturing visuals for reports, or web archiving.",
  "version": "1.0",
  "meta": {
    "templateId": "puppeteer_crawler"
  },
  "input": "./input_schema.json",
  "dockerfile": "./Dockerfile",
  "buildTag": "latest",
  "minMemoryMbytes": 4096,
  "readme": "./README.md",
  "storages": {
    "dataset": {
      "actorSpecification": 1,
      "title": "Results",
      "views": {
        "results": {
          "title": "Results",
          "transformation": {
            "fields": ["screenshot_url", "linkUrl"]
          },
          "display": {
            "component": "table",
            "properties": {
              "linkUrl": {
                "label": "linkUrl",
                "format": "link"
              },
              "screenshot_url": {
                "label": "screenshot_url",
                "format": "image"
              }
            }
          }
        }
      }
    }
  }
}
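
The two fields referenced in this dataset view, screenshot_url and linkUrl, are exactly what src/main.py pushes for every processed URL. An illustrative record is sketched below; the store ID and file name are placeholders, not real values.

# Example dataset record as pushed by Actor.push_data() (values are placeholders)
record = {
    "screenshot_url": "https://api.apify.com/v2/key-value-stores/<store-id>/records/<name>.png",
    "linkUrl": "https://apify.com",
}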
.actor/input_schema.json
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Input Schema",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "fullPage": {
      "title": "Full Page Screenshot",
      "type": "boolean",
      "description": "Indicates whether the screenshot should capture the entire height of the page.",
      "default": false
    },
    "link_urls": {
      "title": "Link URLs",
      "type": "array",
      "description": "A list of URLs to capture screenshots from.",
      "default": ["https://apify.com"],
      "editor": "stringList"
    },
    "window_Width": {
      "title": "Browser Window Width",
      "type": "integer",
      "description": "The width of the browser window in pixels.",
      "default": 1920,
      "unit": "px"
    },
    "window_Height": {
      "title": "Browser Window Height",
      "type": "integer",
      "description": "The height of the browser window in pixels.",
      "default": 1080,
      "unit": "px"
    },
    "Sleep": {
      "type": "integer",
      "title": "Sleep Duration",
      "description": "The duration (in seconds) to wait after loading a page before taking a screenshot.",
      "default": 10,
      "editor": "number"
    },
    "waitUntil": {
      "title": "Navigation Wait Condition",
      "type": "string",
      "description": "Specifies when navigation is considered finished: 'load' (load event fired), 'domcontentloaded' (DOM parsed), 'networkidle0' (no network connections for at least 500 ms), or 'networkidle2' (no more than 2 network connections for at least 500 ms).",
      "editor": "select",
      "default": "networkidle0",
      "enum": ["load", "domcontentloaded", "networkidle0", "networkidle2"],
      "enumTitles": [
        "Load (all resources loaded)",
        "DOM Content Loaded (HTML parsed)",
        "Network Idle 0 (no network connections)",
        "Network Idle 2 (at most 2 network connections)"
      ]
    },
    "cookies": {
      "sectionCaption": "Cookies",
      "sectionDescription": "You can use cookie editors such as [Cookie Editor](https://cookie-editor.com/) or [Copy Cookies](https://chromewebstore.google.com/detail/copy-cookies/jcbpglbplpblnagieibnemmkiamekcdg) to format cookies.",
      "title": "Cookies",
      "type": "array",
      "description": "Cookies to be used for the browsing session, formatted as JSON objects.",
      "editor": "json"
    },
    "scrollToBottom": {
      "sectionCaption": "Scrolling Options",
      "title": "Enable Scrolling to Bottom",
      "type": "boolean",
      "description": "Determines whether the page should be scrolled to the bottom before taking a screenshot.",
      "default": false
    },
    "distance": {
      "title": "Scrolling Distance",
      "type": "integer",
      "description": "The distance (in pixels) to scroll down the page during each scroll action. This controls how much content is revealed with each step.",
      "default": 100,
      "maximum": 1000,
      "unit": "px"
    },
    "delay": {
      "title": "Scrolling Delay",
      "type": "integer",
      "description": "The delay (in milliseconds) to wait after each scroll action. This can be adjusted based on how quickly content loads after scrolling.",
      "default": 100,
      "maximum": 5000,
      "unit": "ms"
    }
  },
  "required": ["link_urls"],
  "additionalProperties": true
}
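
A run input matching this schema can be passed when the actor is called through the Apify API client for Python. The following is a minimal sketch, assuming the apify-client package is installed and an API token is available in the APIFY_TOKEN environment variable; the input values are examples only, and omitted fields fall back to the defaults defined above.

import os
from apify_client import ApifyClient

client = ApifyClient(os.environ["APIFY_TOKEN"])

# Input fields mirror the schema above; values here are illustrative.
run_input = {
    "link_urls": ["https://apify.com", "https://example.com"],
    "fullPage": True,
    "window_Width": 1366,
    "window_Height": 768,
    "Sleep": 5,
    "waitUntil": "networkidle2",
    "scrollToBottom": True,
}

# Start the actor and wait for it to finish.
run = client.actor("dz_omar/example-website-screenshot-crawler").call(run_input=run_input)

# Each dataset item contains the screenshot URL and the page it was taken from.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["linkUrl"], "->", item["screenshot_url"])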
src/__main__.py
import asyncio

from .main import main

# Execute the Actor entrypoint.
asyncio.run(main())
src/main.py
import os
import random
import string
import asyncio
import aiofiles
from apify import Actor
from pyppeteer import launch

class MyActor:
    def __init__(self, actor_input):
        self.link_urls = actor_input.get('link_urls', [])
        self.sleep_duration = actor_input.get('Sleep', 10)  # Seconds to wait before taking the screenshot
        self.waitUntil = actor_input.get('waitUntil', 'networkidle0')  # Navigation wait condition
        self.cookies = actor_input.get('cookies', [])
        self.fullPage = actor_input.get('fullPage', False)  # Capture the full page height
        self.window_Width = actor_input.get('window_Width', 1920)  # Viewport width in pixels
        self.window_Height = actor_input.get('window_Height', 1080)  # Viewport height in pixels

        self.scrollToBottom = actor_input.get('scrollToBottom', False)  # Scroll to the bottom before the screenshot
        # The distance to scroll down the page in pixels for each scroll action.
        self.distance = actor_input.get('distance', 100)

        # The delay in milliseconds to wait after each scroll action.
        self.delay = actor_input.get('delay', 100)

    async def configure_browser(self):
        """Configures the Pyppeteer browser with the necessary options."""
        browser = await launch({
            'headless': True,  # Set to False if you want to see the browser
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--blink-settings=imagesEnabled=false',  # Disable image loading
            ]
        })
        return browser

    async def scroll_to_bottom(self, page):
        """Scrolls to the bottom of the page."""
        await page.evaluate(f"""
            async () => {{
                const distance = {self.distance}; // Scroll by a configurable distance
                const delay = {self.delay}; // Wait for a configurable delay after each scroll
                while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) {{
                    document.scrollingElement.scrollBy(0, distance);
                    await new Promise(resolve => setTimeout(resolve, delay));
                }}
            }}
        """)

    async def process_link(self, page, link_url):
        """Processes a single URL: open page, optionally scroll to bottom, take a screenshot, upload it."""
        try:
            # Set the cookies before navigating to the page
            if self.cookies:
                await page.setCookie(*self.cookies)

            # Set the viewport size to the specified width and height
            await page.setViewport({'width': self.window_Width, 'height': self.window_Height})

            # Open the page and wait for the configured navigation condition
            await page.goto(link_url, {'waitUntil': self.waitUntil})

            # Conditionally scroll to the bottom of the page
            if self.scrollToBottom:
                await self.scroll_to_bottom(page)

            # Give the page time to settle before taking the screenshot
            await asyncio.sleep(self.sleep_duration)

            # Generate a random screenshot name
            screenshot_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16)) + '.png'
            screenshot_path = f'Image_Files/{screenshot_name}'

            # Take the screenshot (full page if requested)
            await page.screenshot({'path': screenshot_path, 'fullPage': self.fullPage})

            # Upload the screenshot to Apify's Key-Value Store
            async with aiofiles.open(screenshot_path, 'rb') as f:
                screenshot_content = await f.read()

            store_id = Actor.config.default_key_value_store_id
            await Actor.set_value(screenshot_name, screenshot_content, content_type='image/png')

            screenshot_url = f'https://api.apify.com/v2/key-value-stores/{store_id}/records/{screenshot_name}'
            await Actor.push_data({'screenshot_url': screenshot_url, 'linkUrl': link_url})

            print(f'Screenshot for {link_url} saved.')

        except Exception as e:
            print(f"Error processing {link_url}: {str(e)}")

    async def run(self):
        """Main execution logic."""
        os.makedirs('Image_Files', exist_ok=True)

        # Configure and launch the browser
        browser = await self.configure_browser()
        page = await browser.newPage()

        for link_url in self.link_urls:
            await self.process_link(page, link_url)

        await browser.close()

async def main():
    async with Actor:
        actor_input = await Actor.get_input() or {}
        my_actor = MyActor(actor_input)
        await my_actor.run()

if __name__ == "__main__":
    asyncio.run(main())
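
For quick experimentation outside the Apify platform, the same Pyppeteer flow (launch, set viewport, navigate, screenshot) can be exercised on its own. The following is a minimal sketch, assuming pyppeteer is installed locally; the URL and output path are examples only and this helper is not part of the actor.

import asyncio
from pyppeteer import launch

async def quick_screenshot(url: str, path: str = 'example.png') -> None:
    # Launch headless Chromium with the same sandbox-related flags used by the actor.
    browser = await launch({'headless': True, 'args': ['--no-sandbox', '--disable-setuid-sandbox']})
    page = await browser.newPage()
    await page.setViewport({'width': 1920, 'height': 1080})
    await page.goto(url, {'waitUntil': 'networkidle2'})
    await page.screenshot({'path': path, 'fullPage': True})
    await browser.close()

asyncio.run(quick_screenshot('https://apify.com'))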
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify==2.0.0
pyppeteer
aiofiles
requests
tenacity
Pricing
Pricing model: Pay per usage
This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.