watch-master-scrapper

yashvi_khunt/watch-master-scrapper
.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, rebuilds will be fast
# for most source file changes.
COPY . ./

# Use compileall to make sure the Actor's Python code is syntactically valid.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "watch-master-scrapper",
    "title": "Watch Master Scraper",
    "description": "Scrapes watch listings from watchmaster.com using BeautifulSoup.",
    "version": "0.0",
    "meta": {
        "templateId": "python-beautifulsoup"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Scraped watch listings",
            "views": {
                "titles": {
                    "title": "Scraped watch listings",
                    "transformation": {
                        "fields": [
                            "url",
                            "brand",
                            "model",
                            "referenceNo",
                            "price",
                            "deliveryTime",
                            "condition",
                            "dimensions",
                            "gender",
                            "case",
                            "bracelet"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "URL",
                                "format": "text"
                            },
                            "brand": {
                                "label": "Brand",
                                "format": "text"
                            },
                            "model": {
                                "label": "Model",
                                "format": "text"
                            },
                            "referenceNo": {
                                "label": "Reference No.",
                                "format": "text"
                            },
                            "price": {
                                "label": "Price",
                                "format": "text"
                            },
                            "deliveryTime": {
                                "label": "Delivery Time",
                                "format": "text"
                            },
                            "condition": {
                                "label": "Condition",
                                "format": "text"
                            },
                            "dimensions": {
                                "label": "Dimensions",
                                "format": "text"
                            },
                            "gender": {
                                "label": "Gender",
                                "format": "text"
                            },
                            "case": {
                                "label": "Case",
                                "format": "text"
                            },
                            "bracelet": {
                                "label": "Bracelet",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}
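
The `fields` list in the dataset view must match the keys of the records the Actor pushes, and the view only labels fields it lists (which is why `condition` needs both a `fields` entry and a `display` property). For illustration, a record produced by `src/main.py` below has roughly this shape; the values here are invented, not real scraped data:

# Illustrative only: a dataset record with the keys the "titles" view expects.
# All values are made up; real ones come from the scraped pages.
record = {
    'url': 'https://watchmaster.com/en/watch/example',
    'brand': 'Rolex',
    'model': 'Submariner',
    'referenceNo': '126610LN',
    'price': '€12,000',
    'deliveryTime': '2-4 days',
    'condition': 'Very good',
    'dimensions': '41 mm',
    'gender': "Men's watch",
    'case': 'Steel',
    'bracelet': 'Oystersteel',
}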

.actor/input_schema.json

{
    "title": "Python BeautifulSoup Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://watchmaster.com/en/shop/rolex" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Maximum depth of links to follow from the start URLs",
            "default": 1
        }
    },
    "required": ["start_urls"]
}
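
At run time this schema resolves to a plain input object, which `src/main.py` reads with `Actor.get_input()`. With the prefilled values above, the input would look roughly like this (a sketch; the `requestListSources` editor may attach extra per-request fields):

# What Actor.get_input() would return for the prefilled values (sketch).
actor_input = {
    'start_urls': [
        {'url': 'https://watchmaster.com/en/shop/rolex'},
    ],
    'max_depth': 1,
}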

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())

src/main.py

from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

from apify import Actor


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()

        # Enqueue the start URLs into the default request queue.
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Process the queue one request at a time.
        while request := await default_queue.fetch_next_request():
            url = request['url']
            Actor.log.info(f'Scraping {url} ...')
            depth = request['userData']['depth']

            try:
                with urlopen(url) as response:
                    html = response.read()
                    soup = BeautifulSoup(html, 'html.parser')

                    if depth < max_depth:
                        # Each product tile on the listing page holds the basic watch data.
                        for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
                            model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
                            brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
                            # The reference is rendered as "Ref.: <number>"; keep only the part after the colon.
                            refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[-1].strip()
                            price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
                            condition = card.find('div', class_='ProductTile_conditionContainer__zrUFy').find('div', class_='ProductTile_text___CSJT').get_text(strip=True)
                            delivery_time = card.find('div', class_='ProductTile_deliveryTime__sSBNw').find('div').next_sibling.strip()
                            link = card.find('a')['href']

                            # Follow the tile's link to the product detail page.
                            detail_url = urljoin(url, link)
                            with urlopen(detail_url) as detail_response:
                                detail_html = detail_response.read()
                                detail_soup = BeautifulSoup(detail_html, 'html.parser')

                                product_details = detail_soup.find('div', class_='product-specifications-accordion')

                                # Extract additional information from the specifications accordion.
                                dimensions = product_details.find('div', class_='specification__title', string='Dimensions').find_next_sibling('div').text.strip()
                                gender = product_details.find('div', class_='specification__title', string='Gender').find_next_sibling('div').text.strip()
                                case = product_details.find('div', class_='specification__title', string='Case').find_next_sibling('div').text.strip()
                                bracelet = product_details.find('div', class_='specification__title', string='Bracelet').find_next_sibling('div').text.strip()

                                # Combine the listing and detail page data into one record.
                                data = {
                                    'url': detail_url,
                                    'brand': brand,
                                    'model': model,
                                    'referenceNo': refno,
                                    'price': price,
                                    'deliveryTime': delivery_time,
                                    'condition': condition,
                                    'dimensions': dimensions,
                                    'gender': gender,
                                    'case': case,
                                    'bracelet': bracelet,
                                }

                            await Actor.push_data(data)
            except Exception:
                Actor.log.exception(f'Cannot extract data from {url}.')
            finally:
                await default_queue.mark_request_as_handled(request)
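
Every bare `card.find(...)` chain above raises `AttributeError` when a class name changes or an element is missing, and the broad `except` then discards the whole listing page. A small helper along these lines would let individual tiles degrade gracefully instead; `safe_text` is a hypothetical name, not part of the Actor:

from bs4 import Tag


def safe_text(parent, name: str, class_: str) -> str | None:
    """Return the stripped text of a matching child tag, or None if it is missing.

    Hypothetical helper: swap it in for the bare `card.find(...).get_text(...)`
    chains above so one missing element doesn't abort the whole page.
    """
    if not isinstance(parent, Tag):
        return None
    element = parent.find(name, class_=class_)
    return element.get_text(strip=True) if element is not None else None


# Example usage inside the tile loop:
# model = safe_text(card, 'div', 'ProductTile_model__o688p')
# brand = safe_text(card, 'div', 'ProductTile_brand__lbN8y')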

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
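
The `~=` operator pins a compatible release: `apify ~= 1.7.0` allows any 1.7.x but not 1.8.0, so rebuilds pick up bug fixes without pulling in breaking API changes. To confirm which versions actually got installed (for example inside the built Docker image), a quick check like this works:

# Print the installed versions of the pinned dependencies.
from importlib.metadata import version

for package in ('apify', 'beautifulsoup4', 'httpx', 'types-beautifulsoup4'):
    print(package, version(package))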