watch-master-scrapper

yashvi_khunt/watch-master-scrapper

This Actor is under maintenance and may be unreliable.
.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, a quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "watch-master-scrapper",
    "title": "Watchmaster scraper (Python + BeautifulSoup)",
    "description": "Scrapes watch listings from watchmaster.com using BeautifulSoup.",
    "version": "0.0",
    "meta": {
        "templateId": "python-beautifulsoup"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Watch listings and their details",
            "views": {
                "titles": {
                    "title": "Watch listings and their details",
                    "transformation": {
                        "fields": [
                            "url",
                            "brand",
                            "model",
                            "referenceNo",
                            "price",
                            "deliveryTime",
                            "condition",
                            "dimensions",
                            "gender",
                            "case",
                            "bracelet"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "URL",
                                "format": "text"
                            },
                            "brand": {
                                "label": "Brand",
                                "format": "text"
                            },
                            "model": {
                                "label": "Model",
                                "format": "text"
                            },
                            "referenceNo": {
                                "label": "Reference No.",
                                "format": "text"
                            },
                            "price": {
                                "label": "Price",
                                "format": "text"
                            },
                            "deliveryTime": {
                                "label": "Delivery Time",
                                "format": "text"
                            },
                            "condition": {
                                "label": "Condition",
                                "format": "text"
                            },
                            "dimensions": {
                                "label": "Dimensions",
                                "format": "text"
                            },
                            "gender": {
                                "label": "Gender",
                                "format": "text"
                            },
                            "case": {
                                "label": "Case",
                                "format": "text"
                            },
                            "bracelet": {
                                "label": "Bracelet",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}
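
A note on the dataset view: the "fields" array above defines the table columns shown in the Apify Console, and each entry must match a key in the items the scraper pushes; a key missing from an item simply renders as an empty cell. A minimal sketch of a conforming record (all values invented for illustration):

from apify import Actor

# Hypothetical record; each key maps to one entry in the view's "fields" list.
record = {
    'url': 'https://watchmaster.com/en/shop/rolex/example',
    'brand': 'Rolex',
    'model': 'Submariner',
    'referenceNo': '126610LN',
    'price': '€13,500',
    'deliveryTime': '2-4 days',
    'condition': 'Very good',
    'dimensions': '41 mm',
    'gender': "Men's watch",
    'case': 'Steel',
    'bracelet': 'Steel'
}

async def push_example() -> None:
    async with Actor:
        await Actor.push_data(record)  # renders as one row of the "titles" view

# Run with: asyncio.run(push_example())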

.actor/input_schema.json

{
    "title": "Python BeautifulSoup Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://watchmaster.com/en/shop/rolex" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Maximum depth to which to scrape",
            "default": 1
        }
    },
    "required": ["start_urls"]
}
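
At run time this schema arrives in main() as a plain dict via Actor.get_input(). With the prefilled defaults above, the input would look like this:

# What Actor.get_input() returns for the prefilled defaults above.
actor_input = {
    'start_urls': [{'url': 'https://watchmaster.com/en/shop/rolex'}],
    'max_depth': 1
}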

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())
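
The handlers above route both the apify and apify_client loggers through ActorLogFormatter, with the SDK logger at DEBUG. If the debug output proves too chatty, the level can be raised without touching the rest of the setup, for example:

import logging

# Raise the SDK logger from DEBUG to INFO to reduce log noise.
logging.getLogger('apify').setLevel(logging.INFO)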

src/main.py

from urllib.parse import urljoin
from urllib.request import urlopen

from apify import Actor
from bs4 import BeautifulSoup


async def main() -> None:
    async with Actor:
        # Read the Actor input (see .actor/input_schema.json).
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Enqueue the start URLs in the default request queue.
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Process the queue one request at a time.
        while request := await default_queue.fetch_next_request():
            url = request['url']
            Actor.log.info(f'Scraping {url} ...')
            depth = request['userData']['depth']

            try:
                with urlopen(url) as response:
                    html = response.read()
                    soup = BeautifulSoup(html, 'html.parser')

                    if depth < max_depth:
                        # Extract the basic data from each product tile on the listing page.
                        for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
                            model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
                            brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
                            refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1]
                            price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
                            condition = card.find('div', class_='ProductTile_conditionContainer__zrUFy').find('div', class_='ProductTile_text___CSJT').get_text(strip=True)
                            delivery_time = card.find('div', class_='ProductTile_deliveryTime__sSBNw').find('div').next_sibling.strip()
                            link = card.find('a')['href']

                            # Follow the link to the product detail page.
                            detail_url = urljoin(url, link)
                            with urlopen(detail_url) as detail_response:
                                detail_html = detail_response.read()
                                detail_soup = BeautifulSoup(detail_html, 'html.parser')

                                product_details = detail_soup.find('div', class_='product-specifications-accordion')

                                # Extract additional information from the specifications accordion.
                                dimensions = product_details.find('div', class_='specification__title', string='Dimensions').find_next_sibling('div').text.strip()
                                gender = product_details.find('div', class_='specification__title', string='Gender').find_next_sibling('div').text.strip()
                                case = product_details.find('div', class_='specification__title', string='Case').find_next_sibling('div').text.strip()
                                bracelet = product_details.find('div', class_='specification__title', string='Bracelet').find_next_sibling('div').text.strip()

                                # Combine the listing and detail page data into one record.
                                data = {
                                    'url': detail_url,
                                    'brand': brand,
                                    'model': model,
                                    'referenceNo': refno,
                                    'price': price,
                                    'deliveryTime': delivery_time,
                                    'condition': condition,
                                    'dimensions': dimensions,
                                    'gender': gender,
                                    'case': case,
                                    'bracelet': bracelet
                                }

                            # Push the record to the default dataset.
                            await Actor.push_data(data)
            except Exception:
                Actor.log.exception(f'Cannot extract data from {url}.')
            finally:
                await default_queue.mark_request_as_handled(request)
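
Because the extraction is driven entirely by CSS class names, the parsing logic can be exercised offline against a static HTML fixture, with no network access. A minimal sketch, where the fixture is hand-written to mimic one product tile (the hashed class suffixes such as __FxMRh are build artifacts of the site and may change at any time, which would silently break all of these selectors):

from bs4 import BeautifulSoup

# A hand-written fixture mimicking one product tile; not real site markup.
SAMPLE_HTML = """
<div class="ProductTile_container__FxMRh">
  <div class="ProductTile_brand__lbN8y">Rolex</div>
  <div class="ProductTile_model__o688p">Submariner</div>
  <div class="ProductTile_reference__cNaET">Ref: 126610LN</div>
  <div class="ProductTile_price__8ctlV">€13,500</div>
  <a href="/en/watch/example">Details</a>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'html.parser')
card = soup.find('div', class_='ProductTile_container__FxMRh')
print(card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True))  # Rolex
print(card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1])  # ' 126610LN' (the scraper keeps the leading space)

Guarding each card.find(...) with a None check would make the loop resilient to tiles that lack one of these elements, instead of relying on the outer try/except to swallow the resulting AttributeError.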

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
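
Note that the ~= compatible-release operator allows patch updates but not a new minor version: for example, apify ~= 1.7.0 is equivalent to apify >= 1.7.0, < 1.8.0.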
Maintained by Community

Actor Metrics

  • 3 monthly users

  • 1 star

  • >99% runs succeeded

  • Created in May 2024

  • Modified 7 months ago
