Example Website Screenshot Crawler avatar
Example Website Screenshot Crawler

Pricing

Pay per usage

Go to Store
Example Website Screenshot Crawler

Example Website Screenshot Crawler

dz_omar/example-website-screenshot-crawler

Developed by

Abdlhakim hefaia

Maintained by Community

Automated website screenshot crawler using Pyppeteer and Apify. This open-source actor captures screenshots from specified URLs, uploads them to the Apify Key-Value Store, and provides easy access to the results, making it ideal for monitoring website changes and archiving web content.

0.0 (0)

Pricing

Pay per usage

5

Monthly users

10

Runs succeeded

>99%

Last modified

5 months ago

.actor/Dockerfile

# Plain Python base image. NOTE: this is NOT the Apify-maintained base image;
# Apify also publishes apify/actor-python images with browsers preinstalled,
# which could simplify this Dockerfile considerably.
FROM python:3.10-slim

# Disable the pip cache and pin a predictable location for Playwright browsers.
ENV PIP_NO_CACHE_DIR=1
ENV PLAYWRIGHT_BROWSERS_PATH=/usr/local/share/playwright/

# System libraries required by headless Chromium (used by both Playwright
# and the Chromium that Pyppeteer downloads at first launch).
RUN apt-get update && apt-get install -y \
    curl \
    unzip \
    wget \
    xvfb \
    libnss3 \
    libatk-bridge2.0-0 \
    libgtk-3-0 \
    libgbm-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached independently
# of source-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install Playwright and its browsers. This is optional when using Pyppeteer
# (as src/main.py does) — Pyppeteer downloads its own Chromium on first
# launch — but keeping it preserves the existing image behavior.
RUN pip install playwright && playwright install --with-deps

# Create the application directory and make it the working directory.
RUN mkdir -p /usr/src/app
WORKDIR /usr/src/app

# Copy the actor source code into the image.
COPY . .

# Run the actor as a Python package (src/__main__.py).
CMD ["python", "-m", "src"]

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "website-screenshot-crawler",
4    "title": "Automated Website Screenshot Crawler",
5    "description": "This actor takes screenshots of specified websites, uploads them to Apify Key-Value Store, and provides URLs for easy access. It is useful for monitoring website changes, capturing visuals for reports, or web archiving.",
6    "version": "1.0",
7    "meta": {
8        "templateId": "puppeteer_crawler"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile",
12    "buildTag": "latest",
13    "minMemoryMbytes": 4096,
14    "readme": "./README.md",
15    "storages": {
16        "dataset": {
17            "actorSpecification": 1,
18            "title": "Results",
19            "views": {
20                "results": {
21                    "title": "results to scan",
22                    "transformation": {
23                        "fields": ["screenshot_url", "linkUrl"]
24                    },
25                    "display": {
26                        "component": "table",
27                        "properties": {
28                            "linkUrl": {
29                                "label": "linkUrl",
30                                "format": "link"
31                            },
32                            "screenshot_url": {
33                                "label": "screenshot_url",
34                                "format": "image"
35                            }
36                        }
37                    }
38                }
39            }
40        }
41    }
42}

.actor/input_schema.json

1{
2  "$schema": "http://json-schema.org/draft-07/schema#",
3  "title": "Input Schema",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "fullPage": {
8      "title": "Full Page Screenshot",
9      "type": "boolean",
10      "description": "Indicates whether the screenshot should capture the entire height of the page.",
11      "default": false
12    },
13    "link_urls": {
14      "title": "Link URLs",
15      "type": "array",
16      "description": "A list of URLs to capture screenshots from.",
17      "default": ["https://apify.com"],
18      "editor": "stringList"
19    },
20    "window_Width": {
21      "title": "Browser Window Width",
22      "type": "integer",
23      "description": "The width of the browser window in pixels.",
24      "default": 1920,
25      "unit": "px"   
26    },
27    "window_Height": {
28      "title": "Browser Window Height",
29      "type": "integer",
30      "description": "The height of the browser window in pixels.",
31      "default": 1080,
32      "unit": "px"
33    },
34    "Sleep": {
35      "type": "integer",
36      "title": "Sleep Duration",
37      "description": "The duration (in seconds) to wait after loading a page before taking a screenshot.",
38      "default": 10,
39      "editor": "number"
40    },
41    "waitUntil": {
42      "title": "Navigation Wait Condition",
43      "type": "string",
44      "description": "Specify when the navigation should be considered finished. Options are 'load' for when the load event is fired, or 'domcontentloaded' for when the DOM has been loaded.",
45      "editor": "select",
46      "default": "networkidle0",
47      "enum": ["load", "domcontentloaded", "networkidle0", "networkidle2"],
48      "enumTitles": [
49        "Load (all resources loaded)", 
50        "DOM Content Loaded (HTML parsed)", 
51        "Network Idle (no network connections)", 
52        "Network Idle (minimal network connections)"
53      ]
54    },
55    "cookies": {
56      "sectionCaption": "Cookies",
57      "sectionDescription": "You can use cookie editors such as [Cookie Editor](https://cookie-editor.com/) or [Copy Cookies](https://chromewebstore.google.com/detail/copy-cookies/jcbpglbplpblnagieibnemmkiamekcdg) to format cookies.",
58      "title": "Cookies",
59      "type": "array",
60      "description": "Cookies to be used for the browsing session, formatted as JSON objects.",
61      "editor": "json"
62    },
63    "scrollToBottom": {
64      "sectionCaption": "Scrolling Option",
65      "title": "Enable Scrolling to Bottom",
66      "type": "boolean",
67      "description": "Determines whether the page should be scrolled to the bottom before taking a screenshot.",
68      "default": false
69    },
70    "distance": {
71      "title": "Scrolling Distance",
72      "type": "integer",
73      "description": "The distance (in pixels) to scroll down the page during each scroll action. This controls how much content is revealed with each step.",
74      "default": 100,  
75      "maximum": 1000, 
76      "unit": "px"
77    },
78    "delay": {
79      "title": "Scrolling Delay",
80      "type": "integer",
81      "description": "The delay (in milliseconds) to wait after each scroll action. This can be adjusted based on how quickly content loads after scrolling.",
82      "default": 100, 
83      "maximum": 5000, 
84      "unit": "ms"
85    }
86  },
87  "required": ["link_urls"],
88  "additionalProperties": true
89}

src/__main__.py

"""Package entrypoint: executes the Actor when run as ``python -m src``."""
import asyncio

from .main import main

# Hand control to the Actor's asynchronous entrypoint.
asyncio.run(main())

src/main.py

1import os
2import random
3import string
4import asyncio
5import aiofiles
6from apify import Actor
7from pyppeteer import launch
8
class MyActor:
    """Captures screenshots of a list of URLs with Pyppeteer, stores each
    image in the Apify Key-Value Store, and pushes the resulting record URL
    (plus the source link) to the default dataset.
    """

    def __init__(self, actor_input):
        """Read actor input, applying the same defaults declared in the
        input schema, so the actor also works when fields are omitted
        (e.g. local runs where the platform does not inject schema defaults).

        :param actor_input: dict parsed from ``Actor.get_input()``.
        """
        self.link_urls = actor_input.get('link_urls', [])
        # Seconds to wait after page load (and optional scrolling) before
        # taking the screenshot. Schema default: 10.
        self.sleep_duration = actor_input.get('Sleep', 10)
        # Pyppeteer navigation wait condition. Schema default: 'networkidle0'.
        self.waitUntil = actor_input.get('waitUntil', 'networkidle0')
        # Cookies for the browsing session; tolerate an explicit null input.
        self.cookies = actor_input.get('cookies') or []
        self.fullPage = actor_input.get('fullPage', False)
        self.window_Width = actor_input.get('window_Width', 1920)
        self.window_Height = actor_input.get('window_Height', 1080)
        self.scrollToBottom = actor_input.get('scrollToBottom', False)
        # Pixels scrolled per step when scrollToBottom is enabled.
        self.distance = actor_input.get('distance', 100)
        # Milliseconds to wait after each scroll step.
        self.delay = actor_input.get('delay', 100)

    async def configure_browser(self):
        """Launch and return a headless Pyppeteer browser configured for
        containerized environments (no sandbox, no GPU, images disabled)."""
        browser = await launch({
            'headless': True,  # Set to False to watch the browser locally.
            'args': [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--blink-settings=imagesEnabled=false',  # Disable image loading
            ]
        })
        return browser

    async def scroll_to_bottom(self, page):
        """Scroll the page to the bottom in increments of ``self.distance``
        pixels, pausing ``self.delay`` milliseconds between steps so lazily
        loaded content has a chance to render."""
        await page.evaluate(f"""
            async () => {{
                const distance = {self.distance};
                const delay = {self.delay};
                while (document.scrollingElement.scrollTop + window.innerHeight < document.scrollingElement.scrollHeight) {{
                    document.scrollingElement.scrollBy(0, distance);
                    await new Promise(resolve => setTimeout(resolve, delay));
                }}
            }}
        """)

    async def process_link(self, page, link_url):
        """Process a single URL: navigate, optionally scroll to the bottom,
        take a screenshot, upload it to the Key-Value Store, and push the
        record URL to the dataset. Errors are logged and swallowed so one
        failing URL does not abort the whole run.

        :param page: an open Pyppeteer page, reused across URLs.
        :param link_url: the URL to capture.
        """
        try:
            # Set cookies (if any) before navigating to the page.
            if self.cookies:
                await page.setCookie(*self.cookies)

            # Set the viewport to the configured window size.
            await page.setViewport({'width': self.window_Width, 'height': self.window_Height})

            # Navigate and wait for the configured load condition.
            await page.goto(link_url, {'waitUntil': self.waitUntil})

            # Optionally reveal lazily loaded content before capturing.
            if self.scrollToBottom:
                await self.scroll_to_bottom(page)

            # Give the page time to settle after loading/scrolling.
            await asyncio.sleep(self.sleep_duration)

            # Generate a random, collision-unlikely screenshot name.
            screenshot_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16)) + '.png'
            screenshot_path = f'Image_Files/{screenshot_name}'

            # Capture the screenshot (full-page if configured).
            await page.screenshot({'path': screenshot_path, 'fullPage': self.fullPage})

            # Read the image back and upload it to the Key-Value Store.
            async with aiofiles.open(screenshot_path, 'rb') as f:
                screenshot_content = await f.read()

            store_id = Actor.config.default_key_value_store_id
            # Set an explicit content type so the record URL serves as a PNG.
            await Actor.set_value(screenshot_name, screenshot_content, content_type='image/png')

            screenshot_url = f'https://api.apify.com/v2/key-value-stores/{store_id}/records/{screenshot_name}'
            await Actor.push_data({'screenshot_url': screenshot_url, 'linkUrl': link_url})

            print(f'Screenshot for {link_url} saved.')

        except Exception as e:
            # Best-effort per URL: report the failure and continue.
            print(f"Error processing {link_url}: {str(e)}")

    async def run(self):
        """Main execution logic: launch the browser, process every input URL
        on a single shared page, and always close the browser afterwards."""
        os.makedirs('Image_Files', exist_ok=True)

        browser = await self.configure_browser()
        try:
            page = await browser.newPage()
            for link_url in self.link_urls:
                await self.process_link(page, link_url)
        finally:
            # Ensure the browser is closed even if a step fails fatally.
            await browser.close()
106
async def main():
    """Actor entrypoint: open the Actor context, read the run input,
    and execute the screenshot crawler."""
    async with Actor:
        actor_input = await Actor.get_input() or {}
        await MyActor(actor_input).run()


if __name__ == "__main__":
    asyncio.run(main())

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify==2.0.0
5pyppeteer
6aiofiles
7requests
8tenacity

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.