Example Website Screenshot Crawler

Developed by dz_omar · Maintained by Community

Automated website screenshot crawler using Pyppeteer and Apify. This open-source actor captures screenshots from specified URLs, uploads them to the Apify Key-Value Store, and provides easy access to the results, making it ideal for monitoring website changes and archiving web content.

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 50
Monthly users: 11
Runs succeeded: >99%
Last modified: 8 months ago
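
Once published, the Actor can be called programmatically and its dataset read back. The sketch below uses the Apify API client for Python; note that the actor ID dz_omar/website-screenshot-crawler is only inferred from the "name" field in .actor/actor.json, and MY_APIFY_TOKEN is a placeholder for your own API token.

# Minimal sketch: run the Actor and list the resulting screenshot URLs.
# Assumptions: actor ID inferred from actor.json, token is a placeholder.
from apify_client import ApifyClient

client = ApifyClient('MY_APIFY_TOKEN')

run_input = {
    'link_urls': ['https://apify.com'],
    'fullPage': True,         # capture the whole page height
    'scrollToBottom': True,   # reveal lazy-loaded content first
    'Sleep': 5,               # settle time before the screenshot, in seconds
}

# Start the Actor and wait for the run to finish.
run = client.actor('dz_omar/website-screenshot-crawler').call(run_input=run_input)

# Each dataset item holds the source URL and the Key-Value Store record URL.
for item in client.dataset(run['defaultDatasetId']).iterate_items():
    print(item['linkUrl'], '->', item['screenshot_url'])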

.actor/Dockerfile

# Use a slim official Python base image (Pyppeteer downloads its own Chromium on first launch)
FROM python:3.10-slim
# Set environment variables
ENV PIP_NO_CACHE_DIR=1
ENV PLAYWRIGHT_BROWSERS_PATH=/usr/local/share/playwright/
# Install system dependencies
RUN apt-get update && apt-get install -y \
curl \
unzip \
wget \
xvfb \
libnss3 \
libatk-bridge2.0-0 \
libgtk-3-0 \
libgbm-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install Playwright and its browsers (not used by the Pyppeteer code path; optional)
RUN pip install playwright && playwright install --with-deps
# Create a directory for the actor's files
RUN mkdir -p /usr/src/app
WORKDIR /usr/src/app
# Copy all files from the current directory into the container
COPY . .
# Command to run the application
CMD ["python", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "website-screenshot-crawler",
    "title": "Automated Website Screenshot Crawler",
    "description": "This actor takes screenshots of specified websites, uploads them to Apify Key-Value Store, and provides URLs for easy access. It is useful for monitoring website changes, capturing visuals for reports, or web archiving.",
    "version": "1.0",
    "meta": {
        "templateId": "puppeteer_crawler"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "buildTag": "latest",
    "minMemoryMbytes": 4096,
    "readme": "./README.md",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Results",
            "views": {
                "results": {
                    "title": "Screenshot results",
                    "transformation": {
                        "fields": ["screenshot_url", "linkUrl"]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "linkUrl": {
                                "label": "linkUrl",
                                "format": "link"
                            },
                            "screenshot_url": {
                                "label": "screenshot_url",
                                "format": "image"
                            }
                        }
                    }
                }
            }
        }
    }
}
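
For reference, each run pushes one dataset item per URL, with exactly the two fields listed in the view's "transformation". A sketch of the item shape, with placeholder values:

# Shape of a single dataset item produced by the Actor (illustrative values).
# The "results" view renders screenshot_url as an image and linkUrl as a link.
item = {
    'screenshot_url': 'https://api.apify.com/v2/key-value-stores/STORE_ID/records/abcdef1234567890.png',
    'linkUrl': 'https://apify.com',
}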

.actor/input_schema.json

{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "Input Schema",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "fullPage": {
            "title": "Full Page Screenshot",
            "type": "boolean",
            "description": "Indicates whether the screenshot should capture the entire height of the page.",
            "default": false
        },
        "link_urls": {
            "title": "Link URLs",
            "type": "array",
            "description": "A list of URLs to capture screenshots from.",
            "default": ["https://apify.com"],
            "editor": "stringList"
        },
        "window_Width": {
            "title": "Browser Window Width",
            "type": "integer",
            "description": "The width of the browser window in pixels.",
            "default": 1920,
            "unit": "px"
        },
        "window_Height": {
            "title": "Browser Window Height",
            "type": "integer",
            "description": "The height of the browser window in pixels.",
            "default": 1080,
            "unit": "px"
        },
        "Sleep": {
            "type": "integer",
            "title": "Sleep Duration",
            "description": "The duration (in seconds) to wait after loading a page before taking a screenshot.",
            "default": 10,
            "editor": "number"
        },
        "waitUntil": {
            "title": "Navigation Wait Condition",
            "type": "string",
            "description": "Specify when navigation should be considered finished: 'load' (the load event fired), 'domcontentloaded' (the DOM was parsed), 'networkidle0' (no network connections for 500 ms), or 'networkidle2' (no more than 2 network connections for 500 ms).",
            "editor": "select",
            "default": "networkidle0",
            "enum": ["load", "domcontentloaded", "networkidle0", "networkidle2"],
            "enumTitles": [
                "Load (all resources loaded)",
                "DOM Content Loaded (HTML parsed)",
                "Network Idle (no network connections)",
                "Network Idle (minimal network connections)"
            ]
        },
        "cookies": {
            "sectionCaption": "Cookies",
            "sectionDescription": "You can use cookie editors such as [Cookie Editor](https://cookie-editor.com/) or [Copy Cookies](https://chromewebstore.google.com/detail/copy-cookies/jcbpglbplpblnagieibnemmkiamekcdg) to format cookies.",
            "title": "Cookies",
            "type": "array",
            "description": "Cookies to be used for the browsing session, formatted as JSON objects.",
            "editor": "json"
        },
        "scrollToBottom": {
            "sectionCaption": "Scrolling Option",
            "title": "Enable Scrolling to Bottom",
            "type": "boolean",
            "description": "Determines whether the page should be scrolled to the bottom before taking a screenshot.",
            "default": false
        },
        "distance": {
            "title": "Scrolling Distance",
            "type": "integer",
            "description": "The distance (in pixels) to scroll down the page during each scroll action. This controls how much content is revealed with each step.",
            "default": 100,
            "maximum": 1000,
            "unit": "px"
        },
        "delay": {
            "title": "Scrolling Delay",
            "type": "integer",
            "description": "The delay (in milliseconds) to wait after each scroll action. This can be adjusted based on how quickly content loads after scrolling.",
            "default": 100,
            "maximum": 5000,
            "unit": "ms"
        }
    },
    "required": ["link_urls"],
    "additionalProperties": true
}
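
The "cookies" field expects an array of cookie objects in the shape Pyppeteer's page.setCookie() accepts: at minimum each object needs a name, a value, and a domain or url. A minimal sketch with placeholder values:

# Illustrative value for the "cookies" input field. These are placeholders,
# not real session data; export real cookies with the editors linked above.
cookies = [
    {'name': 'session_id', 'value': 'abc123', 'domain': '.example.com'},
    {'name': 'consent', 'value': 'true', 'domain': '.example.com', 'path': '/'},
]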

src/__main__.py

import asyncio

from .main import main

# Execute the Actor entrypoint.
asyncio.run(main())

src/main.py

import os
import random
import string
import asyncio
import aiofiles
from apify import Actor
from pyppeteer import launch

class MyActor:
    def __init__(self, actor_input):
        self.link_urls = actor_input.get('link_urls', [])
        self.sleep_duration = actor_input.get('Sleep', 10)  # Seconds to wait before the screenshot
        self.waitUntil = actor_input.get('waitUntil', 'networkidle0')  # Navigation wait condition
        self.cookies = actor_input.get('cookies', [])
        self.fullPage = actor_input.get('fullPage', False)  # Capture the full page height
        self.window_Width = actor_input.get('window_Width', 1920)  # Viewport width in pixels
        self.window_Height = actor_input.get('window_Height', 1080)  # Viewport height in pixels

        self.scrollToBottom = actor_input.get('scrollToBottom', False)  # Scroll before the screenshot
        # The distance to scroll down the page in pixels for each scroll action.
        self.distance = actor_input.get('distance', 100)

        # The delay in milliseconds to wait after each scroll action.
        self.delay = actor_input.get('delay', 100)

    async def configure_browser(self):
        """Configures the Pyppeteer browser with the necessary options."""
        browser = await launch({
            'headless': True,  # Set to False if you want to see the browser
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--blink-settings=imagesEnabled=false',  # Disable image loading
            ]
        })
        return browser

    async def scroll_to_bottom(self, page):
        """Scrolls to the bottom of the page in fixed steps."""
        await page.evaluate(f"""
            async () => {{
                const distance = {self.distance};  // Scroll step in pixels
                const delay = {self.delay};        // Pause after each step in milliseconds
                while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) {{
                    document.scrollingElement.scrollBy(0, distance);
                    await new Promise(resolve => setTimeout(resolve, delay));
                }}
            }}
        """)

    async def process_link(self, page, link_url):
        """Processes a single URL: open page, optionally scroll to bottom, take a screenshot, upload it."""
        try:
            # Set the cookies before navigating to the page
            if self.cookies:
                await page.setCookie(*self.cookies)

            # Set the viewport to the configured width and height
            await page.setViewport({'width': self.window_Width, 'height': self.window_Height})

            # Open the page and wait for the configured navigation condition
            await page.goto(link_url, {'waitUntil': self.waitUntil})

            # Optionally scroll to the bottom of the page to trigger lazy-loaded content
            if self.scrollToBottom:
                await self.scroll_to_bottom(page)

            # Give the page time to settle before taking the screenshot
            await asyncio.sleep(self.sleep_duration)

            # Generate a random screenshot name
            screenshot_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16)) + '.png'
            screenshot_path = f'Image_Files/{screenshot_name}'

            # Take the screenshot (full page if configured)
            await page.screenshot({'path': screenshot_path, 'fullPage': self.fullPage})

            # Upload the screenshot to Apify's Key-Value Store
            async with aiofiles.open(screenshot_path, 'rb') as f:
                screenshot_content = await f.read()

            store_id = Actor.config.default_key_value_store_id
            await Actor.set_value(screenshot_name, screenshot_content, content_type='image/png')

            screenshot_url = f'https://api.apify.com/v2/key-value-stores/{store_id}/records/{screenshot_name}'
            await Actor.push_data({'screenshot_url': screenshot_url, 'linkUrl': link_url})

            print(f'Screenshot for {link_url} saved.')

        except Exception as e:
            print(f'Error processing {link_url}: {e}')

    async def run(self):
        """Main execution logic."""
        os.makedirs('Image_Files', exist_ok=True)

        # Configure and launch the browser
        browser = await self.configure_browser()
        page = await browser.newPage()

        for link_url in self.link_urls:
            await self.process_link(page, link_url)

        await browser.close()

async def main():
    async with Actor:
        actor_input = await Actor.get_input() or {}
        my_actor = MyActor(actor_input)
        await my_actor.run()

if __name__ == "__main__":
    asyncio.run(main())
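
The screenshot_url values pushed by process_link() point at Key-Value Store record endpoints, so a captured screenshot can be fetched outside the platform with plain requests (already listed in requirements.txt). A minimal sketch, with a placeholder URL; take a real one from the "screenshot_url" dataset field:

# Minimal sketch: download one captured screenshot from its record URL.
import requests

screenshot_url = 'https://api.apify.com/v2/key-value-stores/STORE_ID/records/abcdef1234567890.png'

response = requests.get(screenshot_url, timeout=30)
response.raise_for_status()

with open('screenshot.png', 'wb') as f:
    f.write(response.content)  # raw PNG bytes returned by the record endpoint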

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify==2.0.0
pyppeteer
aiofiles
requests
tenacity