Scrapy Books Example

Developed by Vlada Dusek
Maintained by Community

An example Python Scrapy project. It scrapes book data from https://books.toscrape.com/.

Rating: 0.0 (0)

Pricing: Pay per usage
Total users: 11
Monthly users: 2
Runs succeeded: >99%
Last modified: 9 days ago
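
Runs of a Store Actor like this one are typically started through the Apify API. The sketch below uses the apify-client Python package; the API token and the Actor ID (a placeholder based on this Actor's name) are assumptions you would replace with your own values.

from apify_client import ApifyClient

# Placeholder API token -- replace with your own.
client = ApifyClient('<YOUR_APIFY_TOKEN>')

# Start the Actor and wait for it to finish. The run_input mirrors the
# proxyConfiguration field defined in .actor/input_schema.json below.
run = client.actor('<username>/scrapy-books-example').call(
    run_input={'proxyConfiguration': {'useApifyProxy': True}},
)

# Every item the Actor pushes via context.push_data() lands in the run's
# default dataset, which can be read back like this.
for item in client.dataset(run['defaultDatasetId']).iterate_items():
    print(item)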

Makefile

# This is used by GitHub Actions to run the static analysis.
.PHONY: clean install-dev lint type-check format check-code

clean:
	rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage

install-dev:
	uv sync --all-extras --prerelease allow

lint:
	uv run ruff format --check
	uv run ruff check

type-check:
	uv run mypy

format:
	uv run ruff check --fix
	uv run ruff format

check-code: lint type-check

pyproject.toml

[project]
name = "parsel-actor"
version = "0.0.0" # not used by Apify
description = "" # not used by Apify
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "apify@git+https://github.com/apify/apify-sdk-python.git@new-apify-storage-clients",
    "crawlee[parsel]==0.6.12b30",
]

[dependency-groups]
dev = [
    "mypy~=1.17.0",
    "ruff~=0.12.0",
]

[tool.uv]
prerelease = "allow"

[tool.ruff]
line-length = 120
include = ["src/**/*.py"]

[tool.ruff.lint.per-file-ignores]
"**/__init__.py" = [
    "F401", # Unused imports
]
"**/__main__.py" = [
    "E402", # Module level import not at top of file
]

[tool.mypy]
files = ["src"]

[[tool.mypy.overrides]]
module = [
    'scrapy.*',
]
ignore_missing_imports = true

.actor/Dockerfile

FROM apify/actor-python:3.13

ENV ACTOR_DIR="src"
ENV UV_VERSION="0.7"

WORKDIR /usr/src/app

# Install Debian packages.
RUN apt-get update && \
    apt-get install -yq --no-install-recommends \
    git && \
    rm -rf /var/lib/apt/lists/*

# Install uv package manager.
RUN pip install --upgrade pip && \
    pip install uv~=${UV_VERSION}

# Copy python config files and install dependencies.
COPY pyproject.toml ./
RUN uv sync
COPY uv.lock ./
RUN uv export --no-hashes --no-dev | \
    pip install --requirement /dev/stdin --no-dependencies

# Copy the source code of the Actor.
COPY .actor ./.actor
COPY ${ACTOR_DIR}/ ./${ACTOR_DIR}/

# Use compileall to ensure the runnability of the Actor Python code.
RUN uv run python -m compileall -q ${ACTOR_DIR}

# Specify how to launch the source code of your Actor.
CMD ["sh", "-c", "uv run python -m ${ACTOR_DIR}"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "scrapy-books-example",
    "title": "Scrapy Books Example",
    "description": "Example Actor scraping books using Scrapy and Apify SDK.",
    "version": "0.0",
    "meta": {
        "templateId": "python-scrapy"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "scrapy-books-example",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "proxyConfiguration": {
            "sectionCaption": "Proxy and HTTP configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
            "editor": "proxy",
            "prefill": { "useApifyProxy": true },
            "default": { "useApifyProxy": true }
        }
    },
    "required": []
}
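
The schema only defines a proxyConfiguration field, and the main.py shown further below does not read it yet. If you wanted to feed that input into the crawler, a minimal sketch following the usual Apify SDK pattern (not part of this Actor's code) could look like this:

from crawlee.crawlers import ParselCrawler

from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the Actor input defined by .actor/input_schema.json.
        actor_input = await Actor.get_input() or {}

        # Turn the proxyConfiguration input into a Crawlee proxy configuration.
        proxy_configuration = await Actor.create_proxy_configuration(
            actor_proxy_input=actor_input.get('proxyConfiguration'),
        )

        crawler = ParselCrawler(
            proxy_configuration=proxy_configuration,
            max_crawl_depth=2,
        )

        # ... register the request handler and run the crawler as in src/main.py below.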

src/__init__.py


src/__main__.py

from __future__ import annotations

import asyncio

from .main import main

if __name__ == "__main__":
    asyncio.run(main())

src/main.py

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

from apify import Actor


async def main() -> None:
    # Enter the Apify Actor context (initializes storages, logging, etc.).
    async with Actor:
        # Parsel-based crawler that follows links up to two hops from the start URL.
        crawler = ParselCrawler(
            max_crawl_depth=2,
        )

        @crawler.router.default_handler
        async def request_handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            # Extract the page URL and its <title> text.
            data = {
                'url': context.request.url,
                'title': context.selector.css('title::text').get(),
            }

            # Store the record in the default dataset and follow same-domain links.
            await context.push_data(data)
            await context.enqueue_links(strategy='same-domain')

        await crawler.run(['https://crawlee.dev/'])
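
The handler above only stores each page's URL and title, and the start URL points at https://crawlee.dev/ rather than the books site mentioned in the description. A handler tailored to https://books.toscrape.com/ could look roughly like the sketch below; the CSS selectors assume that site's usual product_pod markup and are illustrative, not taken from this Actor's code.

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

from apify import Actor


async def main() -> None:
    async with Actor:
        crawler = ParselCrawler(max_crawl_depth=2)

        @crawler.router.default_handler
        async def book_handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            # Each book on a listing page sits in an <article class="product_pod">
            # element (assumed markup of books.toscrape.com).
            for book in context.selector.css('article.product_pod'):
                await context.push_data({
                    'url': context.request.url,
                    'title': book.css('h3 a::attr(title)').get(),
                    'price': book.css('p.price_color::text').get(),
                })

            # Follow pagination and detail links within the same domain.
            await context.enqueue_links(strategy='same-domain')

        await crawler.run(['https://books.toscrape.com/'])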

src/py.typed