watch-master-scrapper (Deprecated)
Pricing: Pay per usage
Total users: 1
Monthly users: 6
Last modified: a year ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor-1", "title": "Getting started with Python and BeautifulSoup", "description": "Scrapes titles of websites using BeautifulSoup.", "version": "0.0", "meta": { "templateId": "python-beautifulsoup" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "URLs and their titles", "views": { "titles": { "title": "URLs and their titles", "transformation": { "fields": [ "url", "brand", "model", "referenceNo", "price", "deliveryTime", "condition", "dimensions", "gender", "case", "bracelet" ] }, "display": { "component": "table", "properties": { "url": { "label": "URL", "format": "text" }, "brand":{ "label": "Brand", "format": "text" }, "model": { "label": "Model", "format": "text" }, "referenceNo": { "label": "Reference No.", "format": "text" }, "price": { "label": "Price", "format": "text" }, "deliveryTime": { "label": "Delivery Time", "format": "text" }, "dimensions": { "label": "Dimensions", "format": "text" }, "gender": { "label": "Gender", "format": "text" }, "case": { "label": "Case", "format": "text" }, "bracelet": { "label": "Bracelet", "format": "text" } } } } } } }}
.actor/input_schema.json
{ "title": "Python BeautifulSoup Scraper", "type": "object", "schemaVersion": 1, "properties": { "start_urls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://watchmaster.com/en/shop/rolex" } ], "editor": "requestListSources" }, "max_depth": { "title": "Maximum depth", "type": "integer", "description": "Depth to which to scrape to", "default": 1 } }, "required": ["start_urls"]}
src/__main__.py
1"""2This module serves as the entry point for executing the Apify Actor. It handles the configuration of logging3settings. The `main()` coroutine is then executed using `asyncio.run()`.4
5Feel free to modify this file to suit your specific needs.6"""7
8import asyncio9import logging10
11from apify.log import ActorLogFormatter12
13from .main import main14
15# Configure loggers16handler = logging.StreamHandler()17handler.setFormatter(ActorLogFormatter())18
19apify_client_logger = logging.getLogger('apify_client')20apify_client_logger.setLevel(logging.INFO)21apify_client_logger.addHandler(handler)22
23apify_logger = logging.getLogger('apify')24apify_logger.setLevel(logging.DEBUG)25apify_logger.addHandler(handler)26
27# Execute the Actor main coroutine28asyncio.run(main())
src/main.py
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the Actor input (start URLs and maximum crawl depth)
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in actor input, exiting...')
            await Actor.exit()

        # Enqueue the start URLs in the default request queue
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url} ...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Process the queue one request at a time
        while request := await default_queue.fetch_next_request():
            url = request['url']
            Actor.log.info(f'Scraping {url} ...')
            depth = request['userData']['depth']

            try:
                # Fetch and parse the listing page
                with urlopen(url) as response:
                    html = response.read()
                soup = BeautifulSoup(html, 'html.parser')

                if depth < max_depth:
                    for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
                        # Data available directly on the product tile
                        model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
                        brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
                        refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1]
                        price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
                        condition = card.find('div', class_='ProductTile_conditionContainer__zrUFy').find('div', class_='ProductTile_text___CSJT').get_text(strip=True)
                        delivery_time = card.find('div', class_='ProductTile_deliveryTime__sSBNw').find('div').next_sibling.strip()
                        link = card.find('a')['href']

                        # Fetch and parse the product detail page
                        detail_url = urljoin(url, link)
                        with urlopen(detail_url) as detail_response:
                            detail_html = detail_response.read()
                        detail_soup = BeautifulSoup(detail_html, 'html.parser')

                        product_details = detail_soup.find('div', class_='product-specifications-accordion')

                        # Extract additional information from the specifications accordion
                        dimensions = product_details.find('div', class_='specification__title', string='Dimensions').find_next_sibling('div').text.strip()
                        gender = product_details.find('div', class_='specification__title', string='Gender').find_next_sibling('div').text.strip()
                        case = product_details.find('div', class_='specification__title', string='Case').find_next_sibling('div').text.strip()
                        bracelet = product_details.find('div', class_='specification__title', string='Bracelet').find_next_sibling('div').text.strip()

                        # Combine tile and detail page data into one dataset record
                        data = {
                            'url': detail_url,
                            'brand': brand,
                            'model': model,
                            'referenceNo': refno,
                            'price': price,
                            'deliveryTime': delivery_time,
                            'condition': condition,
                            'dimensions': dimensions,
                            'gender': gender,
                            'case': case,
                            'bracelet': bracelet,
                        }

                        await Actor.push_data(data)
            except Exception:
                Actor.log.exception(f'Cannot extract data from {url}.')
            finally:
                await default_queue.mark_request_as_handled(request)
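The selectors in src/main.py are tied to Watchmaster's generated CSS class names. Below is a minimal, self-contained sketch of the same product-tile extraction run against a hand-written HTML fragment; the fragment and its field values are assumptions for illustration, not a real page.

from bs4 import BeautifulSoup

# Hypothetical fragment mimicking one product tile; real pages are larger.
SAMPLE_HTML = """
<div class="ProductTile_container__FxMRh">
  <div class="ProductTile_brand__lbN8y">Rolex</div>
  <div class="ProductTile_model__o688p">Submariner Date</div>
  <div class="ProductTile_reference__cNaET">Ref. No:126610LN</div>
  <div class="ProductTile_price__8ctlV">13,500 EUR</div>
  <a href="/en/product/126610ln"></a>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'html.parser')
for card in soup.find_all('div', class_='ProductTile_container__FxMRh'):
    brand = card.find('div', class_='ProductTile_brand__lbN8y').get_text(strip=True)
    model = card.find('div', class_='ProductTile_model__o688p').get_text(strip=True)
    refno = card.find('div', class_='ProductTile_reference__cNaET').get_text(strip=True).split(':')[1]
    price = card.find('div', class_='ProductTile_price__8ctlV').get_text(strip=True)
    link = card.find('a')['href']
    print(brand, model, refno, price, link)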
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
.venv

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
.DS_Store

apify_storage
storage

.venv/
.env/
__pypackages__
dist/
build/
*.egg-info/
*.egg

__pycache__

.mypy_cache
.dmypy.json
dmypy.json
.pytest_cache
.ruff_cache

.scrapy
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify ~= 1.7.0
beautifulsoup4 ~= 4.12.2
httpx ~= 0.25.2
types-beautifulsoup4 ~= 4.12.0.7
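httpx is pinned here even though src/main.py fetches pages with urllib.request. As a rough sketch of what an async fetch with httpx.AsyncClient could look like if the Actor were switched over, the helper name fetch_html below is illustrative and not part of the Actor's code.

import asyncio

import httpx


async def fetch_html(url: str) -> str:
    # Follow redirects, fail loudly on HTTP errors, then return the body text.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
        return response.text


if __name__ == '__main__':
    print(len(asyncio.run(fetch_html('https://example.com'))))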