Example Website Screenshot Crawler
Try for free
No credit card required
Go to Store
Example Website Screenshot Crawler
dz_omar/example-website-screenshot-crawler
Try for free
No credit card required
Automated website screenshot crawler using Pyppeteer and Apify. This open-source actor captures screenshots from specified URLs, uploads them to the Apify Key-Value Store, and provides easy access to the results, making it ideal for monitoring website changes and archiving web content.
.actor/Dockerfile
# Base image: the official slim Python 3.10 image from Docker Hub
# (this is not one of the Apify-provided base images).
FROM python:3.10-slim

# Disable pip's download cache to keep the image layers small.
ENV PIP_NO_CACHE_DIR=1
# Directory where Playwright stores the browsers it downloads.
ENV PLAYWRIGHT_BROWSERS_PATH=/usr/local/share/playwright/

# Install system packages, including shared libraries used by headless Chromium.
RUN apt-get update && apt-get install -y \
    curl \
    unzip \
    wget \
    xvfb \
    libnss3 \
    libatk-bridge2.0-0 \
    libgtk-3-0 \
    libgbm-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the sources so this layer stays
# cached until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download Pyppeteer's Chromium at build time; otherwise it is fetched on the
# first browser launch of every fresh container.
RUN pyppeteer-install

# Install Playwright and its browsers.
# NOTE(review): the application code only imports Pyppeteer. This step is
# presumably kept because `--with-deps` also installs OS libraries that
# Chromium needs - confirm before removing it.
RUN pip install playwright && playwright install --with-deps

# Working directory for the actor's files (WORKDIR creates it if missing).
WORKDIR /usr/src/app

# Copy all files from the build context into the container.
COPY . .

# Run the actor package as a module.
CMD ["python", "-m", "src"]
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "website-screenshot-crawler",
4 "title": "Automated Website Screenshot Crawler",
5 "description": "This actor takes screenshots of specified websites, uploads them to Apify Key-Value Store, and provides URLs for easy access. It is useful for monitoring website changes, capturing visuals for reports, or web archiving.",
6 "version": "1.0",
7 "meta": {
8 "templateId": "puppeteer_crawler"
9 },
10 "input": "./input_schema.json",
11 "dockerfile": "./Dockerfile",
12 "buildTag": "latest",
13 "minMemoryMbytes": 4096,
14 "readme": "./README.md",
15 "storages": {
16 "dataset": {
17 "actorSpecification": 1,
18 "title": "Results",
19 "views": {
20 "results": {
21 "title": "results to scan",
22 "transformation": {
23 "fields": ["screenshot_url", "linkUrl"]
24 },
25 "display": {
26 "component": "table",
27 "properties": {
28 "linkUrl": {
29 "label": "linkUrl",
30 "format": "link"
31 },
32 "screenshot_url": {
33 "label": "screenshot_url",
34 "format": "image"
35 }
36 }
37 }
38 }
39 }
40 }
41 }
42}
.actor/input_schema.json
1{
2 "$schema": "http://json-schema.org/draft-07/schema#",
3 "title": "Input Schema",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "fullPage": {
8 "title": "Full Page Screenshot",
9 "type": "boolean",
10 "description": "Indicates whether the screenshot should capture the entire height of the page.",
11 "default": false
12 },
13 "link_urls": {
14 "title": "Link URLs",
15 "type": "array",
16 "description": "A list of URLs to capture screenshots from.",
17 "default": ["https://apify.com"],
18 "editor": "stringList"
19 },
20 "window_Width": {
21 "title": "Browser Window Width",
22 "type": "integer",
23 "description": "The width of the browser window in pixels.",
24 "default": 1920,
25 "unit": "px"
26 },
27 "window_Height": {
28 "title": "Browser Window Height",
29 "type": "integer",
30 "description": "The height of the browser window in pixels.",
31 "default": 1080,
32 "unit": "px"
33 },
34 "Sleep": {
35 "type": "integer",
36 "title": "Sleep Duration",
37 "description": "The duration (in seconds) to wait after loading a page before taking a screenshot.",
38 "default": 10,
39 "editor": "number"
40 },
41 "waitUntil": {
42 "title": "Navigation Wait Condition",
43 "type": "string",
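44 "description": "Specify when the navigation should be considered finished: 'load' (the load event has fired), 'domcontentloaded' (the DOM has been parsed), 'networkidle0' (no network connections for at least 500 ms), or 'networkidle2' (no more than two network connections for at least 500 ms).",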
45 "editor": "select",
46 "default": "networkidle0",
47 "enum": ["load", "domcontentloaded", "networkidle0", "networkidle2"],
48 "enumTitles": [
49 "Load (all resources loaded)",
50 "DOM Content Loaded (HTML parsed)",
51 "Network Idle (no network connections)",
52 "Network Idle (minimal network connections)"
53 ]
54 },
55 "cookies": {
56 "sectionCaption": "Cookies",
57 "sectionDescription": "You can use cookie editors such as [Cookie Editor](https://cookie-editor.com/) or [Copy Cookies](https://chromewebstore.google.com/detail/copy-cookies/jcbpglbplpblnagieibnemmkiamekcdg) to format cookies.",
58 "title": "Cookies",
59 "type": "array",
60 "description": "Cookies to be used for the browsing session, formatted as JSON objects.",
61 "editor": "json"
62 },
63 "scrollToBottom": {
64 "sectionCaption": "Scrolling Option",
65 "title": "Enable Scrolling to Bottom",
66 "type": "boolean",
67 "description": "Determines whether the page should be scrolled to the bottom before taking a screenshot.",
68 "default": false
69 },
70 "distance": {
71 "title": "Scrolling Distance",
72 "type": "integer",
73 "description": "The distance (in pixels) to scroll down the page during each scroll action. This controls how much content is revealed with each step.",
74 "default": 100,
75 "maximum": 1000,
76 "unit": "px"
77 },
78 "delay": {
79 "title": "Scrolling Delay",
80 "type": "integer",
81 "description": "The delay (in milliseconds) to wait after each scroll action. This can be adjusted based on how quickly content loads after scrolling.",
82 "default": 100,
83 "maximum": 5000,
84 "unit": "ms"
85 }
86 },
87 "required": ["link_urls"],
88 "additionalProperties": true
89}
src/__main__.py
import asyncio

from .main import main

# Entry point for ``python -m src``: create the Actor coroutine, then drive it
# to completion on a fresh event loop.
actor_coroutine = main()
asyncio.run(actor_coroutine)
src/main.py
1import os
2import random
3import string
4import asyncio
5import aiofiles
6from apify import Actor
7from pyppeteer import launch
8
class MyActor:
    """Capture screenshots of a list of URLs and publish them through Apify.

    Each URL is opened in a headless Chromium page driven by Pyppeteer,
    optionally scrolled to the bottom, and screenshotted. The image is stored
    in the default key-value store and a dataset item with the record URL and
    the source URL is pushed.
    """

    # Fallbacks used when an option is absent or null in the Actor input.
    # They mirror the defaults declared in .actor/input_schema.json.
    _DEFAULTS = {
        'Sleep': 10,
        'waitUntil': 'networkidle0',
        'fullPage': False,
        'window_Width': 1920,
        'window_Height': 1080,
        'scrollToBottom': False,
        'distance': 100,
        'delay': 100,
    }

    def __init__(self, actor_input):
        """Read the run options from the Actor input.

        Args:
            actor_input: Mapping with the Actor input fields. Any option that
                is missing or ``None`` falls back to ``_DEFAULTS``; explicit
                falsy values such as ``0`` or ``False`` are kept.
        """
        def option(key):
            value = actor_input.get(key)
            return self._DEFAULTS[key] if value is None else value

        # A missing/null list becomes empty so iteration and unpacking are safe.
        self.link_urls = actor_input.get('link_urls') or []
        self.cookies = actor_input.get('cookies') or []

        self.sleep_duration = option('Sleep')  # Seconds to wait before the screenshot
        self.waitUntil = option('waitUntil')  # Navigation wait condition
        self.fullPage = option('fullPage')  # Capture the whole page height
        self.window_Width = option('window_Width')  # Viewport width in pixels
        self.window_Height = option('window_Height')  # Viewport height in pixels

        self.scrollToBottom = option('scrollToBottom')  # Scroll before capturing
        self.distance = option('distance')  # Pixels scrolled per step
        self.delay = option('delay')  # Milliseconds to wait after each step

    async def configure_browser(self):
        """Launch and return a headless Pyppeteer browser."""
        browser = await launch({
            'headless': True,  # Set to False if you want to see the browser
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--blink-settings=imagesEnabled=false',  # Disable image loading
            ]
        })
        return browser

    async def scroll_to_bottom(self, page):
        """Scroll *page* down step by step until the bottom is reached.

        The step size and delay are passed to the page function as arguments
        instead of being interpolated into the JavaScript source. The loop
        also stops once the scroll position has not advanced for a few
        consecutive steps, so a page that cannot scroll any further (or a
        non-positive distance) cannot hang the run.
        """
        await page.evaluate(
            """
            async (distance, delay) => {
                const el = document.scrollingElement;
                const maxStalledSteps = 3;
                let stalledSteps = 0;
                while (el.scrollTop + window.innerHeight < el.scrollHeight) {
                    const before = el.scrollTop;
                    el.scrollBy(0, distance);
                    await new Promise(resolve => setTimeout(resolve, delay));
                    stalledSteps = el.scrollTop > before ? 0 : stalledSteps + 1;
                    if (stalledSteps >= maxStalledSteps) {
                        break;
                    }
                }
            }
            """,
            self.distance,
            self.delay,
        )

    async def process_link(self, page, link_url):
        """Open *link_url* in *page*, take a screenshot and publish it.

        Failures are reported and swallowed on purpose so that one bad URL
        does not abort the remaining ones.
        """
        try:
            # Set the cookies before navigating to the page
            if self.cookies:
                await page.setCookie(*self.cookies)

            # Set the viewport size to specified width and height
            await page.setViewport({'width': self.window_Width, 'height': self.window_Height})

            # Open the page and wait for the configured navigation condition
            await page.goto(link_url, {'waitUntil': self.waitUntil})

            # Conditionally scroll to the bottom of the page
            if self.scrollToBottom:
                await self.scroll_to_bottom(page)

            # Give the page time to settle before capturing it
            await asyncio.sleep(self.sleep_duration)

            # Generate a random key for the screenshot record
            screenshot_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16)) + '.png'

            # page.screenshot returns the image bytes, so no temporary file is needed
            screenshot_content = await page.screenshot({'fullPage': self.fullPage})

            # Upload the screenshot to Apify's Key-Value Store as a PNG record
            store_id = Actor.config.default_key_value_store_id
            await Actor.set_value(screenshot_name, screenshot_content, content_type='image/png')

            screenshot_url = f'https://api.apify.com/v2/key-value-stores/{store_id}/records/{screenshot_name}'
            await Actor.push_data({'screenshot_url': screenshot_url, 'linkUrl': link_url})

            print(f'Screenshot for {link_url} saved.')

        except Exception as e:
            print(f"Error processing {link_url}: {str(e)}")

    async def run(self):
        """Launch the browser, process every configured URL, then close it."""
        browser = await self.configure_browser()
        try:
            page = await browser.newPage()
            for link_url in self.link_urls:
                await self.process_link(page, link_url)
        finally:
            # Always release the browser process, even if page creation fails.
            await browser.close()
106
async def main():
    """Run the screenshot crawler inside the Apify Actor context."""
    async with Actor:
        raw_input = await Actor.get_input()
        # A missing or empty input is treated as an empty mapping.
        settings = raw_input if raw_input else {}
        await MyActor(settings).run()


if __name__ == "__main__":
    asyncio.run(main())
requirements.txt
1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify==2.0.0
5pyppeteer
6aiofiles
7requests
8tenacity
Developer
Maintained by Community
Actor Metrics
2 monthly users
-
2 stars
85% runs succeeded
Created in Oct 2024
Modified 3 months ago
Categories