Pricing

Pay per usage

Go to Apify Store

My Actor

Try for free

Developed by

bandi

0.0 (0)

Pricing

Pay per usage

Last modified

a year ago

News

Open source

.actor/Dockerfile

# Base image: Apify's Python 3.11 image with Selenium support.
# Other Apify images are listed at https://hub.docker.com/r/apify/;
# any other image from Docker Hub can be used as well.
FROM apify/actor-python-selenium:3.11

# Copy only requirements.txt first. It is the only file that affects the
# dependency install below, so that layer stays cached (and the build stays fast)
# as long as the requirements do not change.
COPY requirements.txt ./

# Install the packages listed in requirements.txt.
# The Python version, pip version and the full list of installed packages
# are printed to the build log for debugging.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install --no-cache-dir -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Copy the remaining files and directories with the source code.
# Because this happens after the dependency install, rebuilding after
# a source-only change reuses the cached layers above.
COPY . ./

# Byte-compile all sources so that syntax errors fail the build instead of the run.
RUN python3 -m compileall -q .

# Launch the Actor by running the `src` package as a module.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Getting started with Python and Selenium",
    "description": "Scrapes titles of websites using Selenium.",
    "version": "0.0",
    "meta": {
        "templateId": "python-selenium"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "URLs and their titles",
            "views": {
                "titles": {
                    "title": "URLs and their titles",
                    "transformation": {
                        "fields": [
                            "url",
                            "title"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "URL",
                                "format": "text"
                            },
                            "title": {
                                "label": "Title",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}

.actor/input_schema.json

{
    "title": "Python Selenium Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Maximum link depth to scrape (0 = start URLs only)",
            "default": 1
        }
    },
    "required": ["start_urls"]
}

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log

requirements.txt

1# Feel free to add your Python dependencies below. For formatting guidelines, see:
2# https://pip.pypa.io/en/latest/reference/requirements-file-format/
3
4apify ~= 1.7.0
5selenium ~= 4.14.0

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
# Send log records to the console, formatted with the Apify SDK's log formatter.
handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_logger = logging.getLogger('apify')

# The API client logs at INFO, the SDK itself at DEBUG; both share the one handler.
for _logger, _level in (
    (apify_client_logger, logging.INFO),
    (apify_logger, logging.DEBUG),
):
    _logger.setLevel(_level)
    _logger.addHandler(handler)

# Run the Actor's main coroutine to completion.
asyncio.run(main())

src/main.py

1"""
2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file.
3
4Feel free to modify this file to suit your specific needs.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""
9
10from urllib.parse import urljoin
11
12from selenium import webdriver
13from selenium.webdriver.chrome.options import Options as ChromeOptions
14from selenium.webdriver.common.by import By
15
16from apify import Actor
17
18# To run this Actor locally, you need to have the Selenium Chromedriver installed.
19# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
20# When running on the Apify platform, it is already included in the Actor's Docker image.
21
22
async def main() -> None:
    """
    Crawl the start URLs with Selenium and store the URL and title of every visited page.

    Reads `start_urls` and `max_depth` from the Actor input, seeds the default request queue
    with the start URLs at depth 0, and then processes the queue one request at a time in a
    single Chrome WebDriver. For pages shallower than `max_depth`, the targets of their `<a>`
    elements are enqueued one level deeper. For each page, `{'url': ..., 'title': ...}` is
    pushed to the default dataset.

    This has to stay a coroutine: it is executed using `asyncio.run()` and awaits the
    Apify SDK calls.
    """
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()
            # Defensive: make sure the crawl below never runs should `Actor.exit()` return.
            return

        # Enqueue the starting URLs in the default request queue
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            if not url:
                # An entry without a URL cannot be requested; skip it instead of enqueuing None.
                Actor.log.warning(f'Skipping start URL entry without a URL: {start_url!r}')
                continue
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Launch a new Selenium Chrome WebDriver
        Actor.log.info('Launching Chrome WebDriver...')
        chrome_options = ChromeOptions()
        if Actor.config.headless:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=chrome_options)

        # Everything that uses the driver runs inside try/finally so the browser
        # is shut down even when the smoke check or a queue operation raises.
        try:
            # Smoke check that the browser can load a page before the crawl starts.
            driver.get('http://www.example.com')
            assert driver.title == 'Example Domain'

            # Process the requests in the queue one by one
            while request := await default_queue.fetch_next_request():
                url = request['url']
                depth = request['userData']['depth']
                Actor.log.info(f'Scraping {url} ...')

                try:
                    # Open the URL in the Selenium WebDriver
                    driver.get(url)

                    # If we haven't reached the max depth,
                    # look for nested links and enqueue their targets
                    if depth < max_depth:
                        for link in driver.find_elements(By.TAG_NAME, 'a'):
                            link_href = link.get_attribute('href')
                            if not link_href:
                                # Anchors without an href would resolve to the current
                                # page via urljoin() and re-enqueue it; skip them.
                                continue
                            link_url = urljoin(url, link_href)
                            if link_url.startswith(('http://', 'https://')):
                                Actor.log.info(f'Enqueuing {link_url} ...')
                                await default_queue.add_request({
                                    'url': link_url,
                                    'userData': {'depth': depth + 1},
                                })

                    # Push the title of the page into the default dataset
                    title = driver.title
                    await Actor.push_data({'url': url, 'title': title})
                except Exception:
                    # A failure on one page is logged and must not stop the whole crawl.
                    Actor.log.exception(f'Cannot extract data from {url}.')
                finally:
                    # Mark the request handled either way so it is not fetched again.
                    await default_queue.mark_request_as_handled(request)
        finally:
            driver.quit()

My Actor

nmymy/my-actor

Seokaos Melih

My Actor

kevin_p/my-actor

Kevin Perez

My Actor

lurid_quadrille/my-actor

Hari Hari

My Actor

xenon3034/my-actor

Xenon

My Actor

flow_matic/my-actor

Flow Matic

My Actor

dev_fusion/my-actor

Dev Fusion

My Actor

hulajas20/my-actor

对对对

My Actor

tigerbkk5789/my-actor

Thakorn Jirakul

My Actor

zany_guardian/my-actor-1

saksham

My Actor

bigtreemin/my-actor

Min Zhou

# First, specify the base Docker image. # You can see the Docker images from Apify at https://hub.docker.com/r/apify/. # You can also use any other image from Docker Hub. FROM apify/actor-python-selenium:3.11 # Second, copy just requirements.txt into the Actor image, # since it should be the only file that affects the dependency install in the next step, # in order to speed up the build COPY requirements.txt ./ # Install the packages specified in requirements.txt, # Print the installed Python version, pip version # and all installed packages with their versions for debugging RUN echo "Python version:" \ && python --version \ && echo "Pip version:" \ && pip --version \ && echo "Installing dependencies:" \ && pip install -r requirements.txt \ && echo "All installed Python packages:" \ && pip freeze # Next, copy the remaining files and directories with the source code. # Since we do this after installing the dependencies, quick build will be really fast # for most source file changes. COPY . ./ # Use compileall to ensure the runnability of the Actor Python code. RUN python3 -m compileall -q . # Specify how to launch the source code of your Actor. # By default, the "python3 -m src" command is run CMD ["python3", "-m", "src"]

{ "actorSpecification": 1, "name": "my-actor", "title": "Getting started with Python and Selenium", "description": "Scrapes titles of websites using Selenium.", "version": "0.0", "meta": { "templateId": "python-selenium" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "URLs and their titles", "views": { "titles": { "title": "URLs and their titles", "transformation": { "fields": [ "url", "title" ] }, "display": { "component": "table", "properties": { "url": { "label": "URL", "format": "text" }, "title": { "label": "Title", "format": "text" } } } } } } } }

{ "title": "Python Selenium Scraper", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources" }, "max_depth": { "title": "Maximum depth", "type": "integer", "description": "Depth to which to scrape to", "default": 1 } }, "required": ["start_urls"] }

# This file tells Git which files shouldn't be added to source control .idea .DS_Store apify_storage storage .venv/ .env/ __pypackages__ dist/ build/ *.egg-info/ *.egg __pycache__ .mypy_cache .dmypy.json dmypy.json .pytest_cache .ruff_cache .scrapy *.log

1""" 2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging 3settings. The `main()` coroutine is then executed using `asyncio.run()`. 4 5Feel free to modify this file to suit your specific needs. 6""" 7 8import asyncio 9import logging 10 11from apify.log import ActorLogFormatter 12 13from .main import main 14 15# Configure loggers 16handler = logging.StreamHandler() 17handler.setFormatter(ActorLogFormatter()) 18 19apify_client_logger = logging.getLogger('apify_client') 20apify_client_logger.setLevel(logging.INFO) 21apify_client_logger.addHandler(handler) 22 23apify_logger = logging.getLogger('apify') 24apify_logger.setLevel(logging.DEBUG) 25apify_logger.addHandler(handler) 26 27# Execute the Actor main coroutine 28asyncio.run(main())

1""" 2This module defines the `main()` coroutine for the Apify Actor, executed from the `__main__.py` file. 3 4Feel free to modify this file to suit your specific needs. 5 6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation: 7https://docs.apify.com/sdk/python 8""" 9 10from urllib.parse import urljoin 11 12from selenium import webdriver 13from selenium.webdriver.chrome.options import Options as ChromeOptions 14from selenium.webdriver.common.by import By 15 16from apify import Actor 17 18# To run this Actor locally, you need to have the Selenium Chromedriver installed. 19# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/ 20# When running on the Apify platform, it is already included in the Actor's Docker image. 21 22 23async def main() -> None: 24 """ 25 The main coroutine is being executed using `asyncio.run()`, so do not attempt to make a normal function 26 out of it, it will not work. Asynchronous execution is required for communication with Apify platform, 27 and it also enhances performance in the field of web scraping significantly. 
28 """ 29 async with Actor: 30 # Read the Actor input 31 actor_input = await Actor.get_input() or {} 32 start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) 33 max_depth = actor_input.get('max_depth', 1) 34 35 if not start_urls: 36 Actor.log.info('No start URLs specified in actor input, exiting...') 37 await Actor.exit() 38 39 # Enqueue the starting URLs in the default request queue 40 default_queue = await Actor.open_request_queue() 41 for start_url in start_urls: 42 url = start_url.get('url') 43 Actor.log.info(f'Enqueuing {url} ...') 44 await default_queue.add_request({'url': url, 'userData': {'depth': 0}}) 45 46 # Launch a new Selenium Chrome WebDriver 47 Actor.log.info('Launching Chrome WebDriver...') 48 chrome_options = ChromeOptions() 49 if Actor.config.headless: 50 chrome_options.add_argument('--headless') 51 chrome_options.add_argument('--no-sandbox') 52 chrome_options.add_argument('--disable-dev-shm-usage') 53 driver = webdriver.Chrome(options=chrome_options) 54 55 driver.get('http://www.example.com') 56 assert driver.title == 'Example Domain' 57 58 # Process the requests in the queue one by one 59 while request := await default_queue.fetch_next_request(): 60 url = request['url'] 61 depth = request['userData']['depth'] 62 Actor.log.info(f'Scraping {url} ...') 63 64 try: 65 # Open the URL in the Selenium WebDriver 66 driver.get(url) 67 68 # If we haven't reached the max depth, 69 # look for nested links and enqueue their targets 70 if depth < max_depth: 71 for link in driver.find_elements(By.TAG_NAME, 'a'): 72 link_href = link.get_attribute('href') 73 link_url = urljoin(url, link_href) 74 if link_url.startswith(('http://', 'https://')): 75 Actor.log.info(f'Enqueuing {link_url} ...') 76 await default_queue.add_request({ 77 'url': link_url, 78 'userData': {'depth': depth + 1}, 79 }) 80 81 # Push the title of the page into the default dataset 82 title = driver.title 83 await Actor.push_data({'url': url, 'title': title}) 84 except 
Exception: 85 Actor.log.exception(f'Cannot extract data from {url}.') 86 finally: 87 await default_queue.mark_request_as_handled(request) 88 89 driver.quit()

My Actor

My Actor

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

.dockerignore

.editorconfig

.gitignore

requirements.txt

src/__main__.py

src/main.py

You might also like

My Actor

My Actor

My Actor

My Actor

My Actor

My Actor

My Actor

My Actor

My Actor

My Actor

.actor/Dockerfile

.actor/actor.json

.actor/input_schema.json

.dockerignore

.editorconfig

.gitignore

requirements.txt

src/__main__.py

src/main.py

src/main.py

src/main.py