Selenium with wait
Pricing
Pay per usage
Go to Store
Selenium with wait
Runs a simple Selenium-based scrape of a site, but waits a given amount of time for the browser to load the page
0.0 (0)
Pricing
Pay per usage
2
Total users
38
Monthly users
7
Runs succeeded
>99%
Last modified
2 months ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-2", "title": "Getting started with Python and Selenium", "description": "Scrapes titles of websites using Selenium.", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "URLs and their titles", "views": { "titles": { "title": "URLs and their titles", "transformation": { "fields": [ "url", "title" ] }, "display": { "component": "table", "properties": { "url": { "label": "URL", "format": "text" }, "title": { "label": "Title", "format": "text" } } } } } } }}
.actor/input_schema.json
{
    "title": "Python Selenium Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Maximum link depth to scrape to",
            "default": 1
        }
    },
    "required": ["start_urls"]
}
src/__main__.py
"""Entry point for executing the Apify Actor.

Configures logging for the Apify SDK and client loggers, then executes the
`main()` coroutine using `asyncio.run()`.

Feel free to modify this file to suit your specific needs.
"""

import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

# Route both Apify loggers through a single stream handler that uses the
# SDK's log formatter, so all records share one consistent format.
_handler = logging.StreamHandler()
_handler.setFormatter(ActorLogFormatter())

# The client library is noisier, so it stays at INFO; the SDK logs at DEBUG.
for _name, _level in (('apify_client', logging.INFO), ('apify', logging.DEBUG)):
    _logger = logging.getLogger(_name)
    _logger.setLevel(_level)
    _logger.addHandler(_handler)

# Execute the Actor main coroutine.
asyncio.run(main())
src/main.py
"""This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.

Feel free to modify this file to suit your specific needs.

To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
https://docs.apify.com/sdk/python
"""

from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions

from apify import Actor

# To run this Actor locally, you need to have the Selenium Chromedriver installed.
# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
# When running on the Apify platform, it is already included in the Actor's Docker image.


async def main() -> None:
    """Scrape every queued URL with Selenium and push the results to the dataset.

    Reads `start_urls`, `max_depth`, `wait_time_in_seconds` and `cookies` from
    the Actor input. Each start URL is opened in a (headless) Chrome driver,
    the coroutine sleeps `wait_time_in_seconds` so the browser can finish
    loading, and the URL, page title and full HTML source are pushed to the
    default dataset.

    Must be executed with `asyncio.run()`; asynchronous execution is required
    for communication with the Apify platform.
    """
    async with Actor:
        # Read the Actor input.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [
            {'url': 'https://knowledge.alteryx.com/index/s/article/How-to-restart-Alteryx-Service-remotely-when-RDP-is-not-available'},
        ])
        # NOTE(review): `max_depth` is read but never used below — no links are
        # enqueued past the start URLs. Confirm whether recursive crawling was
        # intentionally removed.
        max_depth = actor_input.get('max_depth', 1)
        wait_time = actor_input.get('wait_time_in_seconds', 5)
        cookies = actor_input.get('cookies', [])

        # Accept any real number for the wait; bool is excluded explicitly
        # because it is a subclass of int.
        if isinstance(wait_time, bool) or not isinstance(wait_time, (int, float)):
            Actor.log.info('Sleep time is not a number. Setting default...')
            wait_time = 5

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()

        # Enqueue the starting URLs in the default request queue.
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Launch a new Selenium Chrome WebDriver.
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        try:
            if cookies:
                # Selenium only accepts a cookie while a page is open, so
                # navigate somewhere first. (This replaces the previous
                # unconditional example.com load with a bare `assert` on the
                # title — `assert` is stripped under `python -O` and crashed
                # the whole run whenever the page was unreachable.)
                driver.get('http://www.example.com')
                Actor.log.debug(f"Num of cookies to add: {len(cookies)}")
                for cookie in cookies:
                    try:
                        driver.add_cookie(cookie)
                    except Exception:
                        # A cookie whose domain does not match the current page
                        # is rejected by the driver; skip it rather than
                        # aborting the whole run.
                        Actor.log.exception(f'Cannot add cookie {cookie}.')
                # Refresh the page to apply cookies.
                driver.refresh()

            # Process the requests in the queue one by one.
            while request := await default_queue.fetch_next_request():
                url = request['url']
                Actor.log.info(f'Scraping {url} ...')

                try:
                    # Open the URL, then give the browser a fixed amount of
                    # time to load the page (including any JavaScript).
                    driver.get(url)
                    Actor.log.info(f'Sleeping for this much time: {wait_time} seconds ...')
                    sleep(wait_time)

                    # Push the title and full HTML of the page into the default dataset.
                    title = driver.title
                    html_content = driver.page_source
                    Actor.log.info("Got title and html content")
                    await Actor.push_data({'url': url, 'title': title, 'text': html_content})
                except Exception:
                    Actor.log.exception(f'Cannot extract data from {url}.')
                finally:
                    # Mark the request handled either way so it is not retried forever.
                    await default_queue.mark_request_as_handled(request)
        finally:
            # Always release the browser, even if scraping failed part-way.
            driver.quit()
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
selenium ~= 4.14.0