Login with Selenium
Pricing
Pay per usage
Go to Apify Store
Login with Selenium
Deprecated. Performs a simple login using Selenium and returns the resulting cookies.
0.0 (0)
Pricing
Pay per usage
0
3
1
Last modified
19 days ago
Pricing
Pay per usage
Performs a simple login using Selenium and returns the resulting cookies.
0.0 (0)
Pricing
Pay per usage
0
3
1
Last modified
19 days ago
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python-selenium:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt.
# Print the installed Python version, the pip version,
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
{ "actorSpecification": 1, "name": "my-actor", "title": "Login with Selenium", "description": "Performs a simple login using Selenium and returns the resulting cookies.", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
{ "title": "Python Selenium Scraper", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources" }, "max_depth": { "title": "Maximum depth", "type": "integer", "description": "Depth to which to scrape to", "default": 1 } }, "required": ["start_urls"]}
1
1import asyncio2
3from .main import main4
5# Execute the Actor entry point.6asyncio.run(main())
1"""This module defines the main entry point for the Apify Actor.2
3Feel free to modify this file to suit your specific needs.4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:6https://docs.apify.com/sdk/python7"""8
9import asyncio10from urllib.parse import urljoin11
12from time import sleep13
14from apify import Actor, Request15from selenium import webdriver16from selenium.webdriver.chrome.service import Service17from selenium.webdriver.chrome.options import Options18from selenium.webdriver.common.by import By19from selenium.webdriver.common.keys import Keys20from selenium.webdriver.support.ui import WebDriverWait21from selenium.webdriver.support import expected_conditions as EC22from webdriver_manager.chrome import ChromeDriverManager23
24# To run this Actor locally, you need to have the Selenium Chromedriver installed.25# Follow the installation guide at:26# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/27# When running on the Apify platform, the Chromedriver is already included28# in the Actor's Docker image.29
30
async def main() -> None:
    """Log in to a website with Selenium and push the resulting cookies.

    The Actor input is expected to provide:

    * ``start_urls``: a non-empty list of ``{"url": ...}`` entries.
    * ``credentials``: a mapping with the keys ``user``, ``password`` and
      ``submit`` (each carrying an ``xpath``; ``user`` and ``password`` also
      carry a ``value``), plus an optional ``extra`` mapping with
      ``wait_time`` (seconds, default 5) and ``ok_page_xpath`` (an element
      whose presence marks a successful login).

    For every request taken from the queue, the login form is filled in and
    submitted, and ``{'url': ..., 'cookies': ...}`` is pushed to the default
    dataset. Failures for a single URL are logged and do not abort the run.

    This coroutine is executed using `asyncio.run()`, so it must remain an
    asynchronous function for proper execution.
    """
    default_wait_time = 5

    # Enter the context of the Actor.
    async with Actor:
        # Retrieve the Actor input, and use default values if not provided.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls')
        credentials = actor_input.get('credentials')

        # After each `Actor.exit()` below we return explicitly, so that nothing
        # further runs should the call ever hand control back to us.
        if not start_urls:
            Actor.log.error('No start_urls specified in actor input, exiting...')
            await Actor.exit()
            return

        login_url = start_urls[0]
        if not login_url:
            Actor.log.error('No login url specified in actor input, exiting...')
            await Actor.exit()
            return

        if not credentials:
            Actor.log.error('No credentials specified in actor input, exiting...')
            await Actor.exit()
            return

        # `or {}` also covers keys that are present but explicitly null.
        user_info = credentials.get("user") or {}
        password_info = credentials.get("password") or {}
        submit_info = credentials.get("submit") or {}
        extra_info = credentials.get("extra") or {}

        user = user_info.get("value")
        password = password_info.get("value")

        if not (user and password):
            Actor.log.error('User/Password info is not complete, exiting...')
            await Actor.exit()
            return

        # XPaths locating the form fields and the submit button.
        xpath_user = user_info.get("xpath")
        xpath_password = password_info.get("xpath")
        xpath_submit = submit_info.get("xpath")

        if not (xpath_user and xpath_password and xpath_submit):
            Actor.log.error('XPath info for user/password/submit is not complete, exiting...')
            await Actor.exit()
            return

        Actor.log.debug("All info ok")

        # Optionals: wait time and OK page marker.
        # Going through float() accepts strings such as "2.5"; anything that
        # cannot be converted falls back to the default instead of crashing.
        raw_wait_time = extra_info.get("wait_time", default_wait_time)
        try:
            wait_time = int(float(raw_wait_time))
        except (TypeError, ValueError):
            Actor.log.warning(
                f'Invalid wait_time {raw_wait_time!r}, using {default_wait_time} seconds instead.'
            )
            wait_time = default_wait_time

        ok_page_xpath = extra_info.get("ok_page_xpath")

        # We got all we need, so we can start.

        # Use a dedicated, named request queue for the login URL(s).
        queue_name = "my-login-queue"
        request_queue = await Actor.open_request_queue(name=queue_name)

        # Delete everything in the queue to start clean.
        await request_queue.drop()  # WARNING: deletes the queue
        request_queue = await Actor.open_request_queue(name=queue_name)

        # NOTE(review): every request shares unique_key "login", so presumably
        # only the first start URL is actually kept by the queue -- confirm
        # that this is intended.
        for start_url in start_urls:
            url = start_url.get('url')
            if not url:
                Actor.log.warning(f'Skipping start URL entry without a "url" value: {start_url!r}')
                continue
            Actor.log.info(f'Enqueuing {url} ...')
            new_request = Request.from_url(url, user_data={'depth': 0}, unique_key="login")
            await request_queue.add_request(new_request)

        # Launch a new Selenium Chrome WebDriver.
        Actor.log.info('Launching Chrome Headless WebDriver...')
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        try:
            # Process the requests in the queue one by one.
            while request := await request_queue.fetch_next_request():
                url = request.url
                Actor.log.info(f'Login check: {url} ...')

                try:
                    # Open the URL in the Selenium WebDriver.
                    driver.get(url)
                    Actor.log.info(f'Sleeping for this much time: {wait_time} seconds ...')
                    wait = WebDriverWait(driver, wait_time)

                    # Wait for the login form elements to appear.
                    username_input = wait.until(EC.presence_of_element_located((By.XPATH, xpath_user)))
                    password_input = wait.until(EC.presence_of_element_located((By.XPATH, xpath_password)))
                    submit_button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath_submit)))

                    # Populate the form.
                    username_input.send_keys(user)
                    password_input.send_keys(password)

                    # Submit, then wait either for the success marker or for a
                    # fixed delay when no marker was configured.
                    submit_button.click()
                    if ok_page_xpath:
                        wait.until(EC.presence_of_element_located((By.XPATH, ok_page_xpath)))
                    else:
                        # Non-blocking sleep, so the event loop is not stalled.
                        await asyncio.sleep(wait_time)
                        Actor.log.info("Wake up!")

                    await Actor.push_data({'url': url, 'cookies': driver.get_cookies()})
                except Exception:
                    Actor.log.exception(f'Cannot login: URL is {url}.')
                finally:
                    await request_queue.mark_request_as_handled(request)
        finally:
            # Always shut the browser down, even if the loop itself fails.
            driver.quit()
.git.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
.mise.toml.nvim.luastorage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files__pycache__/*.py[cod]*$py.class
# C extensions*.so
# Distribution / packaging.Pythonbuild/develop-eggs/dist/downloads/eggs/.eggs/lib/lib64/parts/sdist/var/wheels/share/python-wheels/*.egg-info/.installed.cfg*.eggMANIFEST
# PyInstaller# Usually these files are written by a python script from a template# before PyInstaller builds the exe, so as to inject date/other infos into it.*.manifest*.spec
# Installer logspip-log.txtpip-delete-this-directory.txt
# Unit test / coverage reportshtmlcov/.tox/.nox/.coverage.coverage.*.cachenosetests.xmlcoverage.xml*.cover*.py,cover.hypothesis/.pytest_cache/cover/
# Translations*.mo*.pot
# Django stuff:*.loglocal_settings.pydb.sqlite3db.sqlite3-journal
# Flask stuff:instance/.webassets-cache
# Scrapy stuff:.scrapy
# Sphinx documentationdocs/_build/
# PyBuilder.pybuilder/target/
# Jupyter Notebook.ipynb_checkpoints
# IPythonprofile_default/ipython_config.py
# pyenv# For a library or package, you might want to ignore these files since the code is# intended to run in multiple environments; otherwise, check them in:.python-version
# pdm# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.#pdm.lock# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it# in version control.# https://pdm.fming.dev/latest/usage/project/#working-with-version-control.pdm.toml.pdm-python.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm__pypackages__/
# Celery stuffcelerybeat-schedulecelerybeat.pid
# SageMath parsed files*.sage.py
# Environments.env.venvenv/venv/ENV/env.bak/venv.bak/
# Spyder project settings.spyderproject.spyproject
# Rope project settings.ropeproject
# mkdocs documentation/site
# mypy.mypy_cache/.dmypy.jsondmypy.json
# Pyre type checker.pyre/
# pytype static type analyzer.pytype/
# Cython debug symbolscython_debug/
# PyCharm# JetBrains specific template is maintained in a separate JetBrains.gitignore that can# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore# and can be added to the global gitignore or merged into this file. For a more nuclear# option (not recommended) you can uncomment the following to ignore the entire idea folder..idea/
1# Feel free to add your Python dependencies below. For formatting guidelines, see:2# https://pip.pypa.io/en/latest/reference/requirements-file-format/3
4apify >= 1.7.05selenium ~= 4.14.06webdriver_manager