Login with Selenium
Deprecated
Pricing
Pay per usage
Go to Store
Login with Selenium
Deprecated
Performs a simple login using Selenium and returns the resulting cookies.
0.0 (0)
Pricing
Pay per usage
0
Total users
3
Monthly users
2
Runs succeeded
>99%
Last modified
2 months ago
.actor/Dockerfile
1# First, specify the base Docker image.
2# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
3# You can also use any other image from Docker Hub.
4FROM apify/actor-python-selenium:3.13
5
6# Second, copy just requirements.txt into the Actor image,
7# since it should be the only file that affects the dependency install in the next step,
8# in order to speed up the build
9COPY requirements.txt ./
10
11# Install the packages specified in requirements.txt,
12# Print the installed Python version, pip version
13# and all installed packages with their versions for debugging
14RUN echo "Python version:" \
15 && python --version \
16 && echo "Pip version:" \
17 && pip --version \
18 && echo "Installing dependencies:" \
19 && pip install -r requirements.txt \
20 && echo "All installed Python packages:" \
21 && pip freeze
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after installing the dependencies, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28# Use compileall to ensure the runnability of the Actor Python code.
29RUN python3 -m compileall -q .
30
31# Specify how to launch the source code of your Actor.
32# By default, the "python3 -m src" command is run
33CMD ["python3", "-m", "src"]
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "my-actor",
4 "title": "Login with Selenium",
5 "description": "Logs in to a website using Selenium and returns the session cookies.",
6 "version": "0.0",
7 "buildTag": "latest",
8 "meta": {
9 "templateId": "python-selenium"
10 },
11 "input": "./input_schema.json",
12 "dockerfile": "./Dockerfile"
13}
.actor/input_schema.json
1{
2 "title": "Python Selenium Scraper",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "start_urls": {
7 "title": "Start URLs",
8 "type": "array",
9 "description": "URLs to start with",
10 "prefill": [
11 { "url": "https://apify.com" }
12 ],
13 "editor": "requestListSources"
14 },
15 "max_depth": {
16 "title": "Maximum depth",
17 "type": "integer",
18 "description": "Maximum depth to scrape to",
19 "default": 1
20 }
21 },
22 "required": ["start_urls"]
23}
src/__init__.py
src/__main__.py
1import asyncio
2
3from .main import main
4
5# Execute the Actor entry point.
6asyncio.run(main())
src/main.py
1"""This module defines the main entry point for the Apify Actor.
2
3Feel free to modify this file to suit your specific needs.
4
5To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
6https://docs.apify.com/sdk/python
7"""
8
9import asyncio
10from urllib.parse import urljoin
11
12from time import sleep
13
14from apify import Actor, Request
15from selenium import webdriver
16from selenium.webdriver.chrome.service import Service
17from selenium.webdriver.chrome.options import Options
18from selenium.webdriver.common.by import By
19from selenium.webdriver.common.keys import Keys
20from selenium.webdriver.support.ui import WebDriverWait
21from selenium.webdriver.support import expected_conditions as EC
22from webdriver_manager.chrome import ChromeDriverManager
23
24# To run this Actor locally, you need to have the Selenium Chromedriver installed.
25# Follow the installation guide at:
26# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
27# When running on the Apify platform, the Chromedriver is already included
28# in the Actor's Docker image.
29
30
async def main() -> None:
    """Log in to each start URL with Selenium and store the resulting cookies.

    Values read from the Actor input:

    * ``start_urls`` -- list of ``{"url": ...}`` objects, each a login page.
    * ``credentials`` -- object with ``user``, ``password`` and ``submit``
      entries. ``user`` and ``password`` each provide a ``value`` and an
      ``xpath``; ``submit`` provides an ``xpath``. An optional ``extra`` entry
      may provide ``wait_time`` (seconds, defaults to 5) and ``ok_page_xpath``
      (XPath of an element to wait for after the form is submitted).

    For every URL the login form is filled in and submitted, and one item of
    the form ``{'url': ..., 'cookies': [...]}`` is pushed to the default
    dataset. A failure on one URL is logged and does not stop the others.

    This coroutine is executed using `asyncio.run()`, so it must remain an
    asynchronous function for proper execution.
    """
    # Enter the context of the Actor.
    async with Actor:
        # Retrieve the Actor input, and use default values if not provided.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls')
        credentials = actor_input.get('credentials')

        # Each validation failure below ends the run. The explicit ``return``
        # guarantees nothing further executes on invalid input in case
        # ``Actor.exit()`` returns instead of terminating the process.
        if not start_urls:
            Actor.log.error('No start_urls specified in actor input, exiting...')
            await Actor.exit()
            return

        # Keep only the entries that actually carry a URL.
        login_urls = []
        for entry in start_urls:
            entry_url = entry.get('url') if isinstance(entry, dict) else None
            if entry_url:
                login_urls.append(entry_url)
            else:
                Actor.log.warning(f'Skipping start_urls entry without a url: {entry!r}')

        if not login_urls:
            Actor.log.error('No login url specified in actor input, exiting...')
            await Actor.exit()
            return

        if not credentials or not isinstance(credentials, dict):
            Actor.log.error('No credentials specified in actor input, exiting...')
            await Actor.exit()
            return

        # ``or {}`` also covers keys that are present but explicitly null.
        user_info = credentials.get("user") or {}
        password_info = credentials.get("password") or {}
        submit_info = credentials.get("submit") or {}
        extra_info = credentials.get("extra") or {}

        user = user_info.get("value")
        password = password_info.get("value")

        if not (user and password):
            Actor.log.error('User/Password info is not complete, exiting...')
            await Actor.exit()
            return

        # XPaths locating the form controls.
        xpath_user = user_info.get("xpath")
        xpath_password = password_info.get("xpath")
        xpath_submit = submit_info.get("xpath")

        if not (xpath_user and xpath_password and xpath_submit):
            Actor.log.error('XPath info for user/password/submit is not complete, exiting...')
            await Actor.exit()
            return

        Actor.log.debug("All info ok")

        # Optionals: wait time and OK page marker.
        default_wait_time = 5
        wait_time = _coerce_wait_time(extra_info.get("wait_time", default_wait_time))
        if wait_time is None:
            Actor.log.warning(
                f'Invalid wait_time in actor input, using {default_wait_time} seconds instead.'
            )
            wait_time = default_wait_time

        ok_page_xpath = extra_info.get("ok_page_xpath")

        # Enqueue the login URLs in the default request queue.
        request_queue = await Actor.open_request_queue()
        for login_url in login_urls:
            Actor.log.info(f'Enqueuing {login_url} ...')
            new_request = Request.from_url(login_url, user_data={'depth': 0})
            await request_queue.add_request(new_request)

        # Launch a new Selenium Chrome WebDriver.
        Actor.log.info('Launching Chrome Headless WebDriver...')
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        # The outer try/finally makes sure the browser is shut down even when
        # a request-queue operation raises.
        try:
            # Process the requests in the queue one by one.
            while request := await request_queue.fetch_next_request():
                url = request.url
                Actor.log.info(f'Login check: {url} ...')

                try:
                    # Open the URL in the Selenium WebDriver.
                    driver.get(url)
                    Actor.log.info(f'Waiting up to {wait_time} seconds for the login form ...')
                    wait = WebDriverWait(driver, wait_time)

                    # Find the form elements.
                    username_input = wait.until(
                        EC.presence_of_element_located((By.XPATH, xpath_user))
                    )
                    password_input = wait.until(
                        EC.presence_of_element_located((By.XPATH, xpath_password))
                    )
                    submit_button = wait.until(
                        EC.element_to_be_clickable((By.XPATH, xpath_submit))
                    )

                    # Populate and submit.
                    username_input.send_keys(user)
                    password_input.send_keys(password)
                    submit_button.click()

                    if ok_page_xpath:
                        # Wait for the marker element configured for the
                        # post-login page.
                        wait.until(EC.presence_of_element_located((By.XPATH, ok_page_xpath)))
                    else:
                        # No marker configured: give the site a fixed amount of
                        # time. ``asyncio.sleep`` is used instead of
                        # ``time.sleep`` so the event loop is not blocked
                        # during the pause.
                        await asyncio.sleep(wait_time)
                        Actor.log.info("Wake up!")

                    await Actor.push_data({'url': url, 'cookies': driver.get_cookies()})
                except Exception:
                    # Boundary handler: one failing URL must not abort the rest.
                    Actor.log.exception(f'Cannot login: URL is {url}.')
                finally:
                    await request_queue.mark_request_as_handled(request)
        finally:
            driver.quit()


def _coerce_wait_time(raw_value):
    """Return ``raw_value`` as whole, non-negative seconds, or ``None`` if invalid.

    Accepts ints, floats and numeric strings (including ones such as
    ``"2.5"``); fractional values are truncated.
    """
    try:
        seconds = int(float(raw_value))
    except (TypeError, ValueError, OverflowError):
        return None
    return seconds if seconds >= 0 else None
src/py.typed
1
.dockerignore
1.git
2.mise.toml
3.nvim.lua
4storage
5
6# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
7
8# Byte-compiled / optimized / DLL files
9__pycache__/
10*.py[cod]
11*$py.class
12
13# C extensions
14*.so
15
16# Distribution / packaging
17.Python
18build/
19develop-eggs/
20dist/
21downloads/
22eggs/
23.eggs/
24lib/
25lib64/
26parts/
27sdist/
28var/
29wheels/
30share/python-wheels/
31*.egg-info/
32.installed.cfg
33*.egg
34MANIFEST
35
36# PyInstaller
37# Usually these files are written by a python script from a template
38# before PyInstaller builds the exe, so as to inject date/other infos into it.
39*.manifest
40*.spec
41
42# Installer logs
43pip-log.txt
44pip-delete-this-directory.txt
45
46# Unit test / coverage reports
47htmlcov/
48.tox/
49.nox/
50.coverage
51.coverage.*
52.cache
53nosetests.xml
54coverage.xml
55*.cover
56*.py,cover
57.hypothesis/
58.pytest_cache/
59cover/
60
61# Translations
62*.mo
63*.pot
64
65# Django stuff:
66*.log
67local_settings.py
68db.sqlite3
69db.sqlite3-journal
70
71# Flask stuff:
72instance/
73.webassets-cache
74
75# Scrapy stuff:
76.scrapy
77
78# Sphinx documentation
79docs/_build/
80
81# PyBuilder
82.pybuilder/
83target/
84
85# Jupyter Notebook
86.ipynb_checkpoints
87
88# IPython
89profile_default/
90ipython_config.py
91
92# pyenv
93# For a library or package, you might want to ignore these files since the code is
94# intended to run in multiple environments; otherwise, check them in:
95.python-version
96
97# pdm
98# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
99#pdm.lock
100# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
101# in version control.
102# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
103.pdm.toml
104.pdm-python
105.pdm-build/
106
107# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
108__pypackages__/
109
110# Celery stuff
111celerybeat-schedule
112celerybeat.pid
113
114# SageMath parsed files
115*.sage.py
116
117# Environments
118.env
119.venv
120env/
121venv/
122ENV/
123env.bak/
124venv.bak/
125
126# Spyder project settings
127.spyderproject
128.spyproject
129
130# Rope project settings
131.ropeproject
132
133# mkdocs documentation
134/site
135
136# mypy
137.mypy_cache/
138.dmypy.json
139dmypy.json
140
141# Pyre type checker
142.pyre/
143
144# pytype static type analyzer
145.pytype/
146
147# Cython debug symbols
148cython_debug/
149
150# PyCharm
151# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
153# and can be added to the global gitignore or merged into this file. For a more nuclear
154# option (not recommended) you can uncomment the following to ignore the entire idea folder.
155.idea/
.gitignore
1.mise.toml
2.nvim.lua
3storage
4
5# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
6
7# Byte-compiled / optimized / DLL files
8__pycache__/
9*.py[cod]
10*$py.class
11
12# C extensions
13*.so
14
15# Distribution / packaging
16.Python
17build/
18develop-eggs/
19dist/
20downloads/
21eggs/
22.eggs/
23lib/
24lib64/
25parts/
26sdist/
27var/
28wheels/
29share/python-wheels/
30*.egg-info/
31.installed.cfg
32*.egg
33MANIFEST
34
35# PyInstaller
36# Usually these files are written by a python script from a template
37# before PyInstaller builds the exe, so as to inject date/other infos into it.
38*.manifest
39*.spec
40
41# Installer logs
42pip-log.txt
43pip-delete-this-directory.txt
44
45# Unit test / coverage reports
46htmlcov/
47.tox/
48.nox/
49.coverage
50.coverage.*
51.cache
52nosetests.xml
53coverage.xml
54*.cover
55*.py,cover
56.hypothesis/
57.pytest_cache/
58cover/
59
60# Translations
61*.mo
62*.pot
63
64# Django stuff:
65*.log
66local_settings.py
67db.sqlite3
68db.sqlite3-journal
69
70# Flask stuff:
71instance/
72.webassets-cache
73
74# Scrapy stuff:
75.scrapy
76
77# Sphinx documentation
78docs/_build/
79
80# PyBuilder
81.pybuilder/
82target/
83
84# Jupyter Notebook
85.ipynb_checkpoints
86
87# IPython
88profile_default/
89ipython_config.py
90
91# pyenv
92# For a library or package, you might want to ignore these files since the code is
93# intended to run in multiple environments; otherwise, check them in:
94.python-version
95
96# pdm
97# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
98#pdm.lock
99# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
100# in version control.
101# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
102.pdm.toml
103.pdm-python
104.pdm-build/
105
106# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
107__pypackages__/
108
109# Celery stuff
110celerybeat-schedule
111celerybeat.pid
112
113# SageMath parsed files
114*.sage.py
115
116# Environments
117.env
118.venv
119env/
120venv/
121ENV/
122env.bak/
123venv.bak/
124
125# Spyder project settings
126.spyderproject
127.spyproject
128
129# Rope project settings
130.ropeproject
131
132# mkdocs documentation
133/site
134
135# mypy
136.mypy_cache/
137.dmypy.json
138dmypy.json
139
140# Pyre type checker
141.pyre/
142
143# pytype static type analyzer
144.pytype/
145
146# Cython debug symbols
147cython_debug/
148
149# PyCharm
150# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
151# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
152# and can be added to the global gitignore or merged into this file. For a more nuclear
153# option (not recommended) you can uncomment the following to ignore the entire idea folder.
154.idea/
requirements.txt
1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify >= 1.7.0
5selenium ~= 4.14.0
6webdriver_manager