watch-master-scrapper

Deprecated
Developed by Yashvi Khunt
Maintained by Community

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 6
Monthly users: 3
Last modified: a year ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor-1",
    "title": "Getting started with Python and BeautifulSoup",
    "description": "Scrapes titles of websites using BeautifulSoup.",
    "version": "0.0",
    "meta": {
        "templateId": "python-beautifulsoup"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "URLs and their titles",
            "views": {
                "titles": {
                    "title": "URLs and their titles",
                    "transformation": {
                        "fields": [
                            "url",
                            "brand",
                            "model",
                            "referenceNo",
                            "price",
                            "deliveryTime",
                            "condition",
                            "dimensions",
                            "gender",
                            "case",
                            "bracelet"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "URL",
                                "format": "text"
                            },
                            "brand": {
                                "label": "Brand",
                                "format": "text"
                            },
                            "model": {
                                "label": "Model",
                                "format": "text"
                            },
                            "referenceNo": {
                                "label": "Reference No.",
                                "format": "text"
                            },
                            "price": {
                                "label": "Price",
                                "format": "text"
                            },
                            "deliveryTime": {
                                "label": "Delivery Time",
                                "format": "text"
                            },
                            "condition": {
                                "label": "Condition",
                                "format": "text"
                            },
                            "dimensions": {
                                "label": "Dimensions",
                                "format": "text"
                            },
                            "gender": {
                                "label": "Gender",
                                "format": "text"
                            },
                            "case": {
                                "label": "Case",
                                "format": "text"
                            },
                            "bracelet": {
                                "label": "Bracelet",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}
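
For orientation, the record shape this dataset view expects is the one produced by src/main.py further down. A hypothetical example record (all values invented for illustration, not real scraped data) would look like this in Python:

# Hypothetical dataset record matching the view fields above.
# Every value is illustrative only; real records come from Actor.push_data() in src/main.py.
example_record = {
    'url': 'https://watchmaster.com/en/product/example-watch',
    'brand': 'Rolex',
    'model': 'Submariner',
    'referenceNo': '116610LN',
    'price': '9,999 EUR',
    'deliveryTime': '2-4 days',
    'condition': 'Very good',
    'dimensions': '40 mm',
    'gender': 'Men',
    'case': 'Steel',
    'bracelet': 'Steel',
}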

.actor/input_schema.json

{
    "title": "Python BeautifulSoup Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "start_urls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://watchmaster.com/en/shop/rolex" }
            ],
            "editor": "requestListSources"
        },
        "max_depth": {
            "title": "Maximum depth",
            "type": "integer",
            "description": "Depth to which to scrape",
            "default": 1
        }
    },
    "required": ["start_urls"]
}
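
As a sketch of how this schema maps onto the actor's runtime input, the prefilled values above would arrive in src/main.py as roughly the following dict returned by Actor.get_input() (hypothetical example):

# Hypothetical input matching the schema above, in the shape Actor.get_input() returns it.
example_input = {
    'start_urls': [{'url': 'https://watchmaster.com/en/shop/rolex'}],
    'max_depth': 1,
}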

src/__main__.py

1"""
2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging
3settings. The `main()` coroutine is then executed using `asyncio.run()`.
4
5Feel free to modify this file to suit your specific needs.
6"""
7
8import asyncio
9import logging
10
11from apify.log import ActorLogFormatter
12
13from .main import main
14
15# Configure loggers
16handler = logging.StreamHandler()
17handler.setFormatter(ActorLogFormatter())
18
19apify_client_logger = logging.getLogger('apify_client')
20apify_client_logger.setLevel(logging.INFO)
21apify_client_logger.addHandler(handler)
22
23apify_logger = logging.getLogger('apify')
24apify_logger.setLevel(logging.DEBUG)
25apify_logger.addHandler(handler)
26
27# Execute the Actor main coroutine
28asyncio.run(main())

src/main.py

from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup
from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the Actor input (start URLs and maximum crawl depth).
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()

        # Enqueue the start URLs in the default request queue.
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Process the queue one request at a time.
        while request := await default_queue.fetch_next_request():
            url = request['url']
            Actor.log.info(f'Scraping {url} ...')
            depth = request['userData']['depth']

            try:
                with urlopen(url) as response:
                    html = response.read()
                soup = BeautifulSoup(html, 'html.parser')

                if depth < max_depth:
                    # Each product tile on the listing page carries the basic watch data.
                    for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
                        model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
                        brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
                        refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1]
                        price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
                        condition = card.find('div', class_='ProductTile_conditionContainer__zrUFy').find('div', class_='ProductTile_text___CSJT').get_text(strip=True)
                        delivery_time = card.find('div', class_='ProductTile_deliveryTime__sSBNw').find('div').next_sibling.strip()
                        link = card.find('a')['href']

                        # Open the product detail page for the remaining specifications.
                        detail_url = urljoin(url, link)
                        with urlopen(detail_url) as detail_response:
                            detail_html = detail_response.read()
                        detail_soup = BeautifulSoup(detail_html, 'html.parser')

                        product_details = detail_soup.find('div', class_='product-specifications-accordion')

                        # Extract additional information from the specifications accordion.
                        dimensions = product_details.find('div', class_='specification__title', text='Dimensions').find_next_sibling('div').text.strip()
                        gender = product_details.find('div', class_='specification__title', text='Gender').find_next_sibling('div').text.strip()
                        case = product_details.find('div', class_='specification__title', text='Case').find_next_sibling('div').text.strip()
                        bracelet = product_details.find('div', class_='specification__title', text='Bracelet').find_next_sibling('div').text.strip()

                        # Combine the listing and detail page data into a single record.
                        data = {
                            'url': detail_url,
                            'brand': brand,
                            'model': model,
                            'referenceNo': refno,
                            'price': price,
                            'deliveryTime': delivery_time,
                            'condition': condition,
                            'dimensions': dimensions,
                            'gender': gender,
                            'case': case,
                            'bracelet': bracelet,
                        }

                        await Actor.push_data(data)
            except Exception:
                Actor.log.exception(f'Cannot extract data from {url}.')
            finally:
                await default_queue.mark_request_as_handled(request)
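
The selectors above are tied to Watchmaster's generated CSS-module class names, which may well have changed since the actor was last modified. A minimal offline sketch of the same tile-parsing logic, run against hand-written HTML that merely mimics those class names (not real Watchmaster markup), could look like this:

# Offline sketch of the tile parsing used in src/main.py.
# The HTML below is invented to mimic the class names the scraper expects;
# it is NOT real Watchmaster markup.
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<div class="ProductTile_container__FxMRh">
  <div class="ProductTile_brand__lbN8y">Rolex</div>
  <div class="ProductTile_model__o688p">Submariner</div>
  <div class="ProductTile_reference__cNaET">Ref.: 116610LN</div>
  <div class="ProductTile_price__8ctlV">9999 EUR</div>
  <a href="/en/product/example">Details</a>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'html.parser')
for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
    brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
    model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
    refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1]
    price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
    link = card.find('a')['href']
    print(brand, model, refno, price, link)

Running this prints the parsed brand, model, reference number, price and link, which makes it easy to sanity-check the selectors without hitting the live site.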

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
.venv
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store
apify_storage
storage
.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg
__pycache__
.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache
.scrapy
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7