ScraperCodeGenerator

Developed by Ondřej Hlava
Maintained by Community

An intelligent web scraping tool that automatically generates custom scraping code for any website.

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 0
Monthly users: 1
Last modified: a day ago
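
Judging from main.py below, the Actor expects three input fields: targetUrl, userGoal, and claudeApiKey. A minimal sketch of starting a run with the apify-client package follows; the API token, actor ID, and input values are placeholders, not taken from this page.

from apify_client import ApifyClient

client = ApifyClient("<APIFY_API_TOKEN>")  # placeholder token

# Input fields read by main.py via Actor.get_input()
run_input = {
    "targetUrl": "https://example.com/products",
    "userGoal": "Extract product names and prices",
    "claudeApiKey": "<ANTHROPIC_API_KEY>",
}

# Start the Actor and wait for it to finish; the actor ID is a placeholder.
run = client.actor("<username>/scrapercodegenerator").call(run_input=run_input)
print("Run finished with status:", run["status"])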

.gitignore

# --- General ---
.DS_Store
.env
.env.*
# --- Logs ---
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
# --- IDEs ---
.vscode/*
!.vscode/extensions.json
.idea/
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
# --- Python ---
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.venv/
venv/
ENV/
env/
.env
.env.*
.pytest_cache/
.coverage
htmlcov/
.tox/
.cache
.mypy_cache/
.dmypy.json
dmypy.json
# Project specific
scraped_results/
*.html
# --- Node.js / Frontend ---
frontend/node_modules/
frontend/dist/
frontend/dist-ssr/
frontend/.pnp
frontend/.pnp.js
frontend/.npm
node_modules
dist
dist-ssr
*.local
# Added by Apify CLI
storage
.venv
# --- Apify ---
storage/
apify_storage/
# --- Local test files ---
input.json
test_*

.python-version

3.10

Dockerfile

# Use the official Apify Python base image
FROM apify/actor-python:3.11
# Copy requirements and install dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
# Copy the source code
COPY . ./
# Set the entrypoint
CMD ["python", "main.py"]

cli.py

#!/usr/bin/env python3
"""
CLI Entry Point for LLMScraper
"""

import sys
import os

# Add src to path for development
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from llmscraper.main import main

if __name__ == "__main__":
    sys.exit(main())
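
cli.py passes the return value of llmscraper.main.main straight to sys.exit, so that function is expected to return an integer exit code. The llmscraper package itself is not shown on this page; the following is a purely hypothetical sketch of the shape such a main() could take, with flag names mirroring the Actor input fields.

# Hypothetical sketch of src/llmscraper/main.py -- the real module is not shown on this page.
import argparse


def main() -> int:
    parser = argparse.ArgumentParser(description="Generate scraping code for a target site")
    parser.add_argument("--url", required=True, help="target URL (targetUrl in the Actor)")
    parser.add_argument("--goal", required=True, help="what to extract (userGoal in the Actor)")
    parser.add_argument("--claude-api-key", required=True, help="Anthropic API key")
    args = parser.parse_args()

    # ... run the scraper here ...
    print(f"Scraping {args.url}: {args.goal}")
    return 0  # non-zero on failure, as expected by sys.exit(main())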

main.py

1"""
2Apify Actor entry point for LLMScraper.
3"""
4
5import asyncio
6import os
7import sys
8
9# Add src to path
10sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
11
12from apify import Actor
13from llmscraper import run_intelligent_scraper
14
15
16async def main():
17 """Main actor function."""
18 async with Actor:
19 Actor.log.info("DEBUG: Inside Actor context")
20 Actor.log.info("🚀 LLMScraper Actor starting...")
21
22 # Get input
23 actor_input = await Actor.get_input() or {}
24
25 # Extract parameters
26 target_url = actor_input.get('targetUrl')
27 user_goal = actor_input.get('userGoal')
28 claude_api_key = actor_input.get('claudeApiKey')
29
30 # Use actor token for Apify operations
31 apify_token = Actor.config.token
32
33 if not target_url:
34 raise ValueError("targetUrl is required")
35
36 if not user_goal:
37 raise ValueError("userGoal is required")
38
39 if not claude_api_key:
40 raise ValueError("claudeApiKey is required")
41
42 Actor.log.info(f"Target URL: {target_url}")
43 Actor.log.info(f"User Goal: {user_goal}")
44
45 Actor.log.info("DEBUG: About to call run_intelligent_scraper")
46
47 try:
48 # Run the intelligent scraper
49 Actor.log.info("DEBUG: About to call run_intelligent_scraper with parameters:")
50 Actor.log.info(f" - target_url: {target_url}")
51 Actor.log.info(f" - user_goal: {user_goal}")
52 Actor.log.info(f" - has claude_api_key: {bool(claude_api_key)}")
53 Actor.log.info(f" - has apify_token: {bool(apify_token)}")
54
55 results = await run_intelligent_scraper(
56 target_url=target_url,
57 user_goal=user_goal,
58 claude_api_key=claude_api_key,
59 apify_token=apify_token,
60 for_actor=True, # Set for actor mode
61 actor_logger=Actor.log, # Pass Actor logger
62 test_script=True # Enable testing in actor mode
63 )
64
65 Actor.log.info("DEBUG: run_intelligent_scraper completed")
66 Actor.log.info(f"DEBUG: Results type: {type(results)}")
67 Actor.log.info(f"DEBUG: Results success: {getattr(results, 'success', 'No success attr')}")
68
69 if hasattr(results, 'error_message') and results.error_message:
70 Actor.log.error(f"DEBUG: Results error: {results.error_message}")
71
72 if hasattr(results, 'quality_scores') and results.quality_scores:
73 Actor.log.info(f"DEBUG: Quality scores: {results.quality_scores}")
74
75 if hasattr(results, 'best_actor') and results.best_actor:
76 Actor.log.info(f"DEBUG: Best actor: {results.best_actor}")
77
78 if hasattr(results, 'generated_script') and results.generated_script:
79 Actor.log.info(f"DEBUG: Generated script length: {len(results.generated_script)} characters")
80
81 if hasattr(results, 'extracted_data') and results.extracted_data:
82 Actor.log.info(f"DEBUG: Test extraction successful: {len(results.extracted_data) if isinstance(results.extracted_data, list) else 1} items")
83
84 if results.success:
85 Actor.log.info(f"Scraping completed successfully!")
86
87 # Show the generated script in actor mode
88 if hasattr(results, 'generated_script') and results.generated_script:
89 Actor.log.info("\\n" + "="*60)
90 Actor.log.info("📄 GENERATED ACTOR SCRIPT")
91 Actor.log.info("="*60)
92 Actor.log.info(results.generated_script)
93 Actor.log.info("="*60)
94
95 # Create output data
96 output_data = {
97 "url": target_url,
98 "title": getattr(results, 'title', 'N/A'),
99 "data": results.extracted_data if hasattr(results, 'extracted_data') else {},
100 "generated_script": getattr(results, 'generated_script', ''),
101 "best_actor": getattr(results, 'best_actor', ''),
102 "quality_scores": getattr(results, 'quality_scores', {}),
103 "timestamp": "2025-06-29T20:00:00Z", # You might want to use actual timestamp
104 "success": True,
105 "error": None
106 }
107
108 # Push results to default dataset
109 await Actor.push_data([output_data])
110 else:
111 Actor.log.error(f"Scraping failed: {results.error_message}")
112
113 # Still push error data for tracking
114 error_data = {
115 "url": target_url,
116 "title": "N/A",
117 "data": {},
118 "timestamp": "2025-06-29T20:00:00Z",
119 "success": False,
120 "error": results.error_message
121 }
122
123 await Actor.push_data([error_data])
124
125 Actor.log.info("✅ LLMScraper Actor finished successfully!")
126
127 except Exception as e:
128 Actor.log.error(f"❌ Error during scraping: {str(e)}")
129 raise
130
131
132if __name__ == "__main__":
133 asyncio.run(main())
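
Each run pushes a single record to the default dataset with the fields assembled in output_data or error_data above (url, title, data, generated_script, best_actor, quality_scores, timestamp, success, error). A sketch of reading those records back with apify-client after a run has finished; the token and dataset ID are placeholders.

from apify_client import ApifyClient

client = ApifyClient("<APIFY_API_TOKEN>")  # placeholder token

# Read the record(s) main.py pushed with Actor.push_data().
dataset_id = "<defaultDatasetId of the finished run>"
for item in client.dataset(dataset_id).iterate_items():
    if item["success"]:
        print("Best actor:", item.get("best_actor"))
        print("Quality scores:", item.get("quality_scores"))
        print(item.get("generated_script", "")[:500])  # preview of the generated script
    else:
        print("Run failed:", item.get("error"))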

package.json

{
  "name": "scraper-code-generator",
  "version": "0.1.0",
  "description": "Intelligent web scraping framework using AI-powered quality evaluation",
  "main": "main.py",
  "scripts": {
    "start": "python main.py"
  },
  "dependencies": {},
  "author": "",
  "license": "MIT"
}

pyproject.toml

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "llmscraper"
version = "0.1.0"
description = "Intelligent web scraping framework using AI-powered quality evaluation and multiple scraping strategies"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "anthropic>=0.54.0",
    "apify-client>=1.11.0",
    "beautifulsoup4>=4.12.0",
    "apify>=1.5.0",
]

[project.scripts]
llmscraper = "llmscraper.main:main"

[project.optional-dependencies]
dev = [
    "pytest>=7.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0",
    "isort>=5.12.0",
    "mypy>=1.5.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/llmscraper"]

[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples.json",
    "/README.md",
]

requirements.txt

apify>=1.5.0
apify-client>=1.5.0
anthropic>=0.7.0
beautifulsoup4>=4.12.0

uv.lock

version = 1
revision = 2
requires-python = ">=3.10"
[[package]]
name = "annotated-types"
version = "0.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
]
[[package]]
name = "anthropic"
version = "0.54.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
{ name = "distro" },
{ name = "httpx" },
{ name = "jiter" },
{ name = "pydantic" },
{ name = "sniffio" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/89/28/80cb9bb6e7ce77d404145b51da4257455805c17f0a6be528ff3286e3882f/anthropic-0.54.0.tar.gz", hash = "sha256:5e6f997d97ce8e70eac603c3ec2e7f23addeff953fbbb76b19430562bb6ba815", size = 312376, upload-time = "2025-06-11T02:46:27.642Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/de/b9/6ffb48e82c5e97b03cecee872d134a6b6666c2767b2d32ed709f3a60a8fe/anthropic-0.54.0-py3-none-any.whl", hash = "sha256:c1062a0a905daeec17ca9c06c401e4b3f24cb0495841d29d752568a1d4018d56", size = 288774, upload-time = "2025-06-11T02:46:25.578Z" },
]
[[package]]
name = "anyio"
version = "4.9.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "exceptiongroup", marker = "python_full_version < '3.11'" },
{ name = "idna" },
{ name = "sniffio" },
{ name = "typing-extensions", marker = "python_full_version < '3.13'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" },
]
[[package]]
name = "apify-client"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "apify-shared" },
{ name = "colorama" },
{ name = "httpx" },
{ name = "more-itertools" },
]
sdist = { url = "https://files.pythonhosted.org/packages/49/44/b7cae857f2129d4093bc5a0a2267fcbba7905207a0b7cc424dc3c7c90291/apify_client-1.11.0.tar.gz", hash = "sha256:c2e151754c35be9bc7c1028bf7cb127aeb1ffa2fbd1ec1ad7e97b901deb32e08", size = 346095, upload-time = "2025-06-13T11:46:39.129Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8f/24/d3273bfe5b4a96fd60c8d554edbab99274fae8cb2347b96f2e3fa0bc4d5b/apify_client-1.11.0-py3-none-any.whl", hash = "sha256:9d691960bdbeee17624a2a82aafc4f0bfba9b48820a48f559b7eba76bf01cb3c", size = 82550, upload-time = "2025-06-13T11:46:37.483Z" },
]
[[package]]
name = "apify-shared"
version = "1.4.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b2/a6/c8e2fa0b3bdc479d3ecde778e2381af199f910cf7c8baa3c207bcfe26e47/apify_shared-1.4.1.tar.gz", hash = "sha256:16e617c840fd27bf38d980f079c0b867c7378f68c7006b3d5a7d530d43930507", size = 13871, upload-time = "2025-04-28T12:20:01.113Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1d/f3/3446c8a7986fdc087024d4e174e4b3f587097a9b28f6f8e8c788199225b2/apify_shared-1.4.1-py3-none-any.whl", hash = "sha256:abac5712b6e8eb96693204cbb2702905e1971d9084b1716e7337852b5005290e", size = 12706, upload-time = "2025-04-28T12:19:59.792Z" },
]
[[package]]
name = "certifi"
version = "2025.6.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
[[package]]
name = "distro"
version = "1.9.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
]
[[package]]
name = "exceptiongroup"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions", marker = "python_full_version < '3.13'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },
]
[[package]]
name = "h11"
version = "0.16.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
]
[[package]]
name = "httpcore"
version = "1.0.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "h11" },
]
sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
]
[[package]]
name = "httpx"
version = "0.28.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
{ name = "certifi" },
{ name = "httpcore" },
{ name = "idna" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]
[[package]]
name = "idna"
version = "3.10"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },
]
[[package]]
name = "jiter"
version = "0.10.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ee/9d/ae7ddb4b8ab3fb1b51faf4deb36cb48a4fbbd7cb36bad6a5fca4741306f7/jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500", size = 162759, upload-time = "2025-05-18T19:04:59.73Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/be/7e/4011b5c77bec97cb2b572f566220364e3e21b51c48c5bd9c4a9c26b41b67/jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303", size = 317215, upload-time = "2025-05-18T19:03:04.303Z" },
{ url = "https://files.pythonhosted.org/packages/8a/4f/144c1b57c39692efc7ea7d8e247acf28e47d0912800b34d0ad815f6b2824/jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e", size = 322814, upload-time = "2025-05-18T19:03:06.433Z" },
{ url = "https://files.pythonhosted.org/packages/63/1f/db977336d332a9406c0b1f0b82be6f71f72526a806cbb2281baf201d38e3/jiter-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8b3e0068c26ddedc7abc6fac37da2d0af16b921e288a5a613f4b86f050354f", size = 345237, upload-time = "2025-05-18T19:03:07.833Z" },
{ url = "https://files.pythonhosted.org/packages/d7/1c/aa30a4a775e8a672ad7f21532bdbfb269f0706b39c6ff14e1f86bdd9e5ff/jiter-0.10.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:286299b74cc49e25cd42eea19b72aa82c515d2f2ee12d11392c56d8701f52224", size = 370999, upload-time = "2025-05-18T19:03:09.338Z" },
{ url = "https://files.pythonhosted.org/packages/35/df/f8257abc4207830cb18880781b5f5b716bad5b2a22fb4330cfd357407c5b/jiter-0.10.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ed5649ceeaeffc28d87fb012d25a4cd356dcd53eff5acff1f0466b831dda2a7", size = 491109, upload-time = "2025-05-18T19:03:11.13Z" },
{ url = "https://files.pythonhosted.org/packages/06/76/9e1516fd7b4278aa13a2cc7f159e56befbea9aa65c71586305e7afa8b0b3/jiter-0.10.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2ab0051160cb758a70716448908ef14ad476c3774bd03ddce075f3c1f90a3d6", size = 388608, upload-time = "2025-05-18T19:03:12.911Z" },
{ url = "https://files.pythonhosted.org/packages/6d/64/67750672b4354ca20ca18d3d1ccf2c62a072e8a2d452ac3cf8ced73571ef/jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03997d2f37f6b67d2f5c475da4412be584e1cec273c1cfc03d642c46db43f8cf", size = 352454, upload-time = "2025-05-18T19:03:14.741Z" },
{ url = "https://files.pythonhosted.org/packages/96/4d/5c4e36d48f169a54b53a305114be3efa2bbffd33b648cd1478a688f639c1/jiter-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c404a99352d839fed80d6afd6c1d66071f3bacaaa5c4268983fc10f769112e90", size = 391833, upload-time = "2025-05-18T19:03:16.426Z" },
{ url = "https://files.pythonhosted.org/packages/0b/de/ce4a6166a78810bd83763d2fa13f85f73cbd3743a325469a4a9289af6dae/jiter-0.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66e989410b6666d3ddb27a74c7e50d0829704ede652fd4c858e91f8d64b403d0", size = 523646, upload-time = "2025-05-18T19:03:17.704Z" },
{ url = "https://files.pythonhosted.org/packages/a2/a6/3bc9acce53466972964cf4ad85efecb94f9244539ab6da1107f7aed82934/jiter-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b532d3af9ef4f6374609a3bcb5e05a1951d3bf6190dc6b176fdb277c9bbf15ee", size = 514735, upload-time = "2025-05-18T19:03:19.44Z" },
{ url = "https://files.pythonhosted.org/packages/b4/d8/243c2ab8426a2a4dea85ba2a2ba43df379ccece2145320dfd4799b9633c5/jiter-0.10.0-cp310-cp310-win32.whl", hash = "sha256:da9be20b333970e28b72edc4dff63d4fec3398e05770fb3205f7fb460eb48dd4", size = 210747, upload-time = "2025-05-18T19:03:21.184Z" },
{ url = "https://files.pythonhosted.org/packages/37/7a/8021bd615ef7788b98fc76ff533eaac846322c170e93cbffa01979197a45/jiter-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:f59e533afed0c5b0ac3eba20d2548c4a550336d8282ee69eb07b37ea526ee4e5", size = 207484, upload-time = "2025-05-18T19:03:23.046Z" },
{ url = "https://files.pythonhosted.org/packages/1b/dd/6cefc6bd68b1c3c979cecfa7029ab582b57690a31cd2f346c4d0ce7951b6/jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978", size = 317473, upload-time = "2025-05-18T19:03:25.942Z" },
{ url = "https://files.pythonhosted.org/packages/be/cf/fc33f5159ce132be1d8dd57251a1ec7a631c7df4bd11e1cd198308c6ae32/jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc", size = 321971, upload-time = "2025-05-18T19:03:27.255Z" },
{ url = "https://files.pythonhosted.org/packages/68/a4/da3f150cf1d51f6c472616fb7650429c7ce053e0c962b41b68557fdf6379/jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d", size = 345574, upload-time = "2025-05-18T19:03:28.63Z" },
{ url = "https://files.pythonhosted.org/packages/84/34/6e8d412e60ff06b186040e77da5f83bc158e9735759fcae65b37d681f28b/jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2", size = 371028, upload-time = "2025-05-18T19:03:30.292Z" },
{ url = "https://files.pythonhosted.org/packages/fb/d9/9ee86173aae4576c35a2f50ae930d2ccb4c4c236f6cb9353267aa1d626b7/jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61", size = 491083, upload-time = "2025-05-18T19:03:31.654Z" },
{ url = "https://files.pythonhosted.org/packages/d9/2c/f955de55e74771493ac9e188b0f731524c6a995dffdcb8c255b89c6fb74b/jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db", size = 388821, upload-time = "2025-05-18T19:03:33.184Z" },
{ url = "https://files.pythonhosted.org/packages/81/5a/0e73541b6edd3f4aada586c24e50626c7815c561a7ba337d6a7eb0a915b4/jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5", size = 352174, upload-time = "2025-05-18T19:03:34.965Z" },
{ url = "https://files.pythonhosted.org/packages/1c/c0/61eeec33b8c75b31cae42be14d44f9e6fe3ac15a4e58010256ac3abf3638/jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606", size = 391869, upload-time = "2025-05-18T19:03:36.436Z" },
{ url = "https://files.pythonhosted.org/packages/41/22/5beb5ee4ad4ef7d86f5ea5b4509f680a20706c4a7659e74344777efb7739/jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605", size = 523741, upload-time = "2025-05-18T19:03:38.168Z" },
{ url = "https://files.pythonhosted.org/packages/ea/10/768e8818538e5817c637b0df52e54366ec4cebc3346108a4457ea7a98f32/jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5", size = 514527, upload-time = "2025-05-18T19:03:39.577Z" },
{ url = "https://files.pythonhosted.org/packages/73/6d/29b7c2dc76ce93cbedabfd842fc9096d01a0550c52692dfc33d3cc889815/jiter-0.10.0-cp311-cp311-win32.whl", hash = "sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7", size = 210765, upload-time = "2025-05-18T19:03:41.271Z" },
{ url = "https://files.pythonhosted.org/packages/c2/c9/d394706deb4c660137caf13e33d05a031d734eb99c051142e039d8ceb794/jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812", size = 209234, upload-time = "2025-05-18T19:03:42.918Z" },
{ url = "https://files.pythonhosted.org/packages/6d/b5/348b3313c58f5fbfb2194eb4d07e46a35748ba6e5b3b3046143f3040bafa/jiter-0.10.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1e274728e4a5345a6dde2d343c8da018b9d4bd4350f5a472fa91f66fda44911b", size = 312262, upload-time = "2025-05-18T19:03:44.637Z" },
{ url = "https://files.pythonhosted.org/packages/9c/4a/6a2397096162b21645162825f058d1709a02965606e537e3304b02742e9b/jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7202ae396446c988cb2a5feb33a543ab2165b786ac97f53b59aafb803fef0744", size = 320124, upload-time = "2025-05-18T19:03:46.341Z" },
{ url = "https://files.pythonhosted.org/packages/2a/85/1ce02cade7516b726dd88f59a4ee46914bf79d1676d1228ef2002ed2f1c9/jiter-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23ba7722d6748b6920ed02a8f1726fb4b33e0fd2f3f621816a8b486c66410ab2", size = 345330, upload-time = "2025-05-18T19:03:47.596Z" },
{ url = "https://files.pythonhosted.org/packages/75/d0/bb6b4f209a77190ce10ea8d7e50bf3725fc16d3372d0a9f11985a2b23eff/jiter-0.10.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:371eab43c0a288537d30e1f0b193bc4eca90439fc08a022dd83e5e07500ed026", size = 369670, upload-time = "2025-05-18T19:03:49.334Z" },
{ url = "https://files.pythonhosted.org/packages/a0/f5/a61787da9b8847a601e6827fbc42ecb12be2c925ced3252c8ffcb56afcaf/jiter-0.10.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c675736059020365cebc845a820214765162728b51ab1e03a1b7b3abb70f74c", size = 489057, upload-time = "2025-05-18T19:03:50.66Z" },
{ url = "https://files.pythonhosted.org/packages/12/e4/6f906272810a7b21406c760a53aadbe52e99ee070fc5c0cb191e316de30b/jiter-0.10.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c5867d40ab716e4684858e4887489685968a47e3ba222e44cde6e4a2154f959", size = 389372, upload-time = "2025-05-18T19:03:51.98Z" },
{ url = "https://files.pythonhosted.org/packages/e2/ba/77013b0b8ba904bf3762f11e0129b8928bff7f978a81838dfcc958ad5728/jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395bb9a26111b60141757d874d27fdea01b17e8fac958b91c20128ba8f4acc8a", size = 352038, upload-time = "2025-05-18T19:03:53.703Z" },
{ url = "https://files.pythonhosted.org/packages/67/27/c62568e3ccb03368dbcc44a1ef3a423cb86778a4389e995125d3d1aaa0a4/jiter-0.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6842184aed5cdb07e0c7e20e5bdcfafe33515ee1741a6835353bb45fe5d1bd95", size = 391538, upload-time = "2025-05-18T19:03:55.046Z" },
{ url = "https://files.pythonhosted.org/packages/c0/72/0d6b7e31fc17a8fdce76164884edef0698ba556b8eb0af9546ae1a06b91d/jiter-0.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62755d1bcea9876770d4df713d82606c8c1a3dca88ff39046b85a048566d56ea", size = 523557, upload-time = "2025-05-18T19:03:56.386Z" },
{ url = "https://files.pythonhosted.org/packages/2f/09/bc1661fbbcbeb6244bd2904ff3a06f340aa77a2b94e5a7373fd165960ea3/jiter-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:533efbce2cacec78d5ba73a41756beff8431dfa1694b6346ce7af3a12c42202b", size = 514202, upload-time = "2025-05-18T19:03:57.675Z" },
{ url = "https://files.pythonhosted.org/packages/1b/84/5a5d5400e9d4d54b8004c9673bbe4403928a00d28529ff35b19e9d176b19/jiter-0.10.0-cp312-cp312-win32.whl", hash = "sha256:8be921f0cadd245e981b964dfbcd6fd4bc4e254cdc069490416dd7a2632ecc01", size = 211781, upload-time = "2025-05-18T19:03:59.025Z" },
{ url = "https://files.pythonhosted.org/packages/9b/52/7ec47455e26f2d6e5f2ea4951a0652c06e5b995c291f723973ae9e724a65/jiter-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7c7d785ae9dda68c2678532a5a1581347e9c15362ae9f6e68f3fdbfb64f2e49", size = 206176, upload-time = "2025-05-18T19:04:00.305Z" },
{ url = "https://files.pythonhosted.org/packages/2e/b0/279597e7a270e8d22623fea6c5d4eeac328e7d95c236ed51a2b884c54f70/jiter-0.10.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0588107ec8e11b6f5ef0e0d656fb2803ac6cf94a96b2b9fc675c0e3ab5e8644", size = 311617, upload-time = "2025-05-18T19:04:02.078Z" },
{ url = "https://files.pythonhosted.org/packages/91/e3/0916334936f356d605f54cc164af4060e3e7094364add445a3bc79335d46/jiter-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cafc4628b616dc32530c20ee53d71589816cf385dd9449633e910d596b1f5c8a", size = 318947, upload-time = "2025-05-18T19:04:03.347Z" },
{ url = "https://files.pythonhosted.org/packages/6a/8e/fd94e8c02d0e94539b7d669a7ebbd2776e51f329bb2c84d4385e8063a2ad/jiter-0.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520ef6d981172693786a49ff5b09eda72a42e539f14788124a07530f785c3ad6", size = 344618, upload-time = "2025-05-18T19:04:04.709Z" },
{ url = "https://files.pythonhosted.org/packages/6f/b0/f9f0a2ec42c6e9c2e61c327824687f1e2415b767e1089c1d9135f43816bd/jiter-0.10.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:554dedfd05937f8fc45d17ebdf298fe7e0c77458232bcb73d9fbbf4c6455f5b3", size = 368829, upload-time = "2025-05-18T19:04:06.912Z" },
{ url = "https://files.pythonhosted.org/packages/e8/57/5bbcd5331910595ad53b9fd0c610392ac68692176f05ae48d6ce5c852967/jiter-0.10.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bc299da7789deacf95f64052d97f75c16d4fc8c4c214a22bf8d859a4288a1c2", size = 491034, upload-time = "2025-05-18T19:04:08.222Z" },
{ url = "https://files.pythonhosted.org/packages/9b/be/c393df00e6e6e9e623a73551774449f2f23b6ec6a502a3297aeeece2c65a/jiter-0.10.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5161e201172de298a8a1baad95eb85db4fb90e902353b1f6a41d64ea64644e25", size = 388529, upload-time = "2025-05-18T19:04:09.566Z" },
{ url = "https://files.pythonhosted.org/packages/42/3e/df2235c54d365434c7f150b986a6e35f41ebdc2f95acea3036d99613025d/jiter-0.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e2227db6ba93cb3e2bf67c87e594adde0609f146344e8207e8730364db27041", size = 350671, upload-time = "2025-05-18T19:04:10.98Z" },
{ url = "https://files.pythonhosted.org/packages/c6/77/71b0b24cbcc28f55ab4dbfe029f9a5b73aeadaba677843fc6dc9ed2b1d0a/jiter-0.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:15acb267ea5e2c64515574b06a8bf393fbfee6a50eb1673614aa45f4613c0cca", size = 390864, upload-time = "2025-05-18T19:04:12.722Z" },
{ url = "https://files.pythonhosted.org/packages/6a/d3/ef774b6969b9b6178e1d1e7a89a3bd37d241f3d3ec5f8deb37bbd203714a/jiter-0.10.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:901b92f2e2947dc6dfcb52fd624453862e16665ea909a08398dde19c0731b7f4", size = 522989, upload-time = "2025-05-18T19:04:14.261Z" },
{ url = "https://files.pythonhosted.org/packages/0c/41/9becdb1d8dd5d854142f45a9d71949ed7e87a8e312b0bede2de849388cb9/jiter-0.10.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d0cb9a125d5a3ec971a094a845eadde2db0de85b33c9f13eb94a0c63d463879e", size = 513495, upload-time = "2025-05-18T19:04:15.603Z" },
{ url = "https://files.pythonhosted.org/packages/9c/36/3468e5a18238bdedae7c4d19461265b5e9b8e288d3f86cd89d00cbb48686/jiter-0.10.0-cp313-cp313-win32.whl", hash = "sha256:48a403277ad1ee208fb930bdf91745e4d2d6e47253eedc96e2559d1e6527006d", size = 211289, upload-time = "2025-05-18T19:04:17.541Z" },
{ url = "https://files.pythonhosted.org/packages/7e/07/1c96b623128bcb913706e294adb5f768fb7baf8db5e1338ce7b4ee8c78ef/jiter-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:75f9eb72ecb640619c29bf714e78c9c46c9c4eaafd644bf78577ede459f330d4", size = 205074, upload-time = "2025-05-18T19:04:19.21Z" },
{ url = "https://files.pythonhosted.org/packages/54/46/caa2c1342655f57d8f0f2519774c6d67132205909c65e9aa8255e1d7b4f4/jiter-0.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:28ed2a4c05a1f32ef0e1d24c2611330219fed727dae01789f4a335617634b1ca", size = 318225, upload-time = "2025-05-18T19:04:20.583Z" },
{ url = "https://files.pythonhosted.org/packages/43/84/c7d44c75767e18946219ba2d703a5a32ab37b0bc21886a97bc6062e4da42/jiter-0.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a4c418b1ec86a195f1ca69da8b23e8926c752b685af665ce30777233dfe070", size = 350235, upload-time = "2025-05-18T19:04:22.363Z" },
{ url = "https://files.pythonhosted.org/packages/01/16/f5a0135ccd968b480daad0e6ab34b0c7c5ba3bc447e5088152696140dcb3/jiter-0.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d7bfed2fe1fe0e4dda6ef682cee888ba444b21e7a6553e03252e4feb6cf0adca", size = 207278, upload-time = "2025-05-18T19:04:23.627Z" },
{ url = "https://files.pythonhosted.org/packages/1c/9b/1d646da42c3de6c2188fdaa15bce8ecb22b635904fc68be025e21249ba44/jiter-0.10.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:5e9251a5e83fab8d87799d3e1a46cb4b7f2919b895c6f4483629ed2446f66522", size = 310866, upload-time = "2025-05-18T19:04:24.891Z" },
{ url = "https://files.pythonhosted.org/packages/ad/0e/26538b158e8a7c7987e94e7aeb2999e2e82b1f9d2e1f6e9874ddf71ebda0/jiter-0.10.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:023aa0204126fe5b87ccbcd75c8a0d0261b9abdbbf46d55e7ae9f8e22424eeb8", size = 318772, upload-time = "2025-05-18T19:04:26.161Z" },
{ url = "https://files.pythonhosted.org/packages/7b/fb/d302893151caa1c2636d6574d213e4b34e31fd077af6050a9c5cbb42f6fb/jiter-0.10.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c189c4f1779c05f75fc17c0c1267594ed918996a231593a21a5ca5438445216", size = 344534, upload-time = "2025-05-18T19:04:27.495Z" },
{ url = "https://files.pythonhosted.org/packages/01/d8/5780b64a149d74e347c5128d82176eb1e3241b1391ac07935693466d6219/jiter-0.10.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15720084d90d1098ca0229352607cd68256c76991f6b374af96f36920eae13c4", size = 369087, upload-time = "2025-05-18T19:04:28.896Z" },
{ url = "https://files.pythonhosted.org/packages/e8/5b/f235a1437445160e777544f3ade57544daf96ba7e96c1a5b24a6f7ac7004/jiter-0.10.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4f2fb68e5f1cfee30e2b2a09549a00683e0fde4c6a2ab88c94072fc33cb7426", size = 490694, upload-time = "2025-05-18T19:04:30.183Z" },
{ url = "https://files.pythonhosted.org/packages/85/a9/9c3d4617caa2ff89cf61b41e83820c27ebb3f7b5fae8a72901e8cd6ff9be/jiter-0.10.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce541693355fc6da424c08b7edf39a2895f58d6ea17d92cc2b168d20907dee12", size = 388992, upload-time = "2025-05-18T19:04:32.028Z" },
{ url = "https://files.pythonhosted.org/packages/68/b1/344fd14049ba5c94526540af7eb661871f9c54d5f5601ff41a959b9a0bbd/jiter-0.10.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c50c40272e189d50006ad5c73883caabb73d4e9748a688b216e85a9a9ca3b9", size = 351723, upload-time = "2025-05-18T19:04:33.467Z" },
{ url = "https://files.pythonhosted.org/packages/41/89/4c0e345041186f82a31aee7b9d4219a910df672b9fef26f129f0cda07a29/jiter-0.10.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fa3402a2ff9815960e0372a47b75c76979d74402448509ccd49a275fa983ef8a", size = 392215, upload-time = "2025-05-18T19:04:34.827Z" },
{ url = "https://files.pythonhosted.org/packages/55/58/ee607863e18d3f895feb802154a2177d7e823a7103f000df182e0f718b38/jiter-0.10.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:1956f934dca32d7bb647ea21d06d93ca40868b505c228556d3373cbd255ce853", size = 522762, upload-time = "2025-05-18T19:04:36.19Z" },
{ url = "https://files.pythonhosted.org/packages/15/d0/9123fb41825490d16929e73c212de9a42913d68324a8ce3c8476cae7ac9d/jiter-0.10.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86", size = 513427, upload-time = "2025-05-18T19:04:37.544Z" },
{ url = "https://files.pythonhosted.org/packages/d8/b3/2bd02071c5a2430d0b70403a34411fc519c2f227da7b03da9ba6a956f931/jiter-0.10.0-cp314-cp314-win32.whl", hash = "sha256:ac509f7eccca54b2a29daeb516fb95b6f0bd0d0d8084efaf8ed5dfc7b9f0b357", size = 210127, upload-time = "2025-05-18T19:04:38.837Z" },
{ url = "https://files.pythonhosted.org/packages/03/0c/5fe86614ea050c3ecd728ab4035534387cd41e7c1855ef6c031f1ca93e3f/jiter-0.10.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ed975b83a2b8639356151cef5c0d597c68376fc4922b45d0eb384ac058cfa00", size = 318527, upload-time = "2025-05-18T19:04:40.612Z" },
{ url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" },
]
[[package]]
name = "llmscraper"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "anthropic" },
{ name = "apify-client" },
]
[package.metadata]
requires-dist = [
{ name = "anthropic", specifier = ">=0.54.0" },
{ name = "apify-client", specifier = ">=1.11.0" },
]
[[package]]
name = "more-itertools"
version = "10.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671, upload-time = "2025-04-22T14:17:41.838Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278, upload-time = "2025-04-22T14:17:40.49Z" },
]
[[package]]
name = "pydantic"
version = "2.11.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "annotated-types" },
{ name = "pydantic-core" },
{ name = "typing-extensions" },
{ name = "typing-inspection" },
]
sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },
]
[[package]]
name = "pydantic-core"
version = "2.33.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" },
{ url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" },
{ url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" },
{ url = "https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" },
{ url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" },
{ url = "https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" },
{ url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" },
{ url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" },
{ url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" },
{ url = "https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" },
{ url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" },
{ url = "https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" },
{ url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" },
{ url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" },
{ url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" },
{ url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" },
{ url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" },
{ url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" },
{ url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" },
{ url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = "2025-04-23T18:31:13.536Z" },
{ url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" },
{ url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" },
{ url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" },
{ url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" },
{ url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" },
{ url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" },
{ url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" },
{ url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" },
{ url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" },
{ url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" },
{ url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" },
{ url = "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" },
{ url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" },
{ url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" },
{ url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" },
{ url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" },
{ url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" },
{ url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" },
{ url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" },
{ url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" },
{ url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" },
{ url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" },
{ url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" },
{ url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" },
{ url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" },
{ url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" },
{ url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" },
{ url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" },
{ url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" },
{ url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" },
{ url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" },
{ url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" },
{ url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" },
{ url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" },
{ url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" },
{ url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" },
{ url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" },
{ url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" },
{ url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" },
{ url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" },
{ url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" },
{ url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" },
{ url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" },
{ url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" },
{ url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = "2025-04-23T18:33:08.44Z" },
{ url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" },
{ url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" },
{ url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" },
{ url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" },
{ url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" },
{ url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" },
{ url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" },
{ url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" },
{ url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" },
{ url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" },
{ url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },
]
[[package]]
name = "sniffio"
version = "1.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]
[[package]]
name = "typing-extensions"
version = "4.14.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d1/bc/51647cd02527e87d05cb083ccc402f93e441606ff1f01739a62c8ad09ba5/typing_extensions-4.14.0.tar.gz", hash = "sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4", size = 107423, upload-time = "2025-06-02T14:52:11.399Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af", size = 43839, upload-time = "2025-06-02T14:52:10.026Z" },
]
[[package]]
name = "typing-inspection"
version = "0.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },
]

.actor/README.md

# ScraperCodeGenerator Actor

An intelligent web scraping actor that uses AI-powered quality evaluation and multiple scraping strategies to extract data from websites.

## Features

- **AI-Powered Quality Evaluation**: Uses Claude AI to score how well each fetched page supports your extraction goal
- **Multiple Scraping Strategies**: Runs several Apify actors (Cheerio Scraper, Web Scraper, Website Content Crawler) against the target page
- **Intelligent Pipeline**: Automatically selects the best-performing approach for the website's structure

## Input

The actor expects the following input parameters:

- **Target URL** (required): The URL of the website to scrape
- **User Goal** (required): Description of the data you want to extract (e.g., "Extract product names, prices, and descriptions")
- **Claude API Key** (required): Your Anthropic Claude API key for AI-powered evaluation
- **Max Retries** (optional, default 3): Maximum number of retry attempts for scraping
- **Timeout** (optional, default 60): Timeout for each scraping attempt, in seconds

## Output

The actor returns the extracted data in a structured format based on your specified goal. Results are automatically saved to the default dataset.

## Usage

1. Provide the target URL you want to scrape
2. Describe what data you want to extract in the "User Goal" field
3. Enter your Claude API key
4. Run the actor

The actor will analyze the website, extract the requested data, and evaluate the quality of the results using AI.

## Example Input

```json
{
  "targetUrl": "https://example-store.com/products",
  "userGoal": "Extract product names, prices, and availability status",
  "claudeApiKey": "sk-ant-..."
}
```

## Requirements

- A valid Claude API key from Anthropic
- The target website must be accessible
- Sufficient Apify compute units for the scraping operation
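
For illustration, the actor can also be called programmatically with the `apify-client` Python package. A minimal sketch, assuming placeholder tokens and an actor ID that you must replace with your own:

```python
from apify_client import ApifyClient

# Placeholder token and actor ID -- substitute your own values.
client = ApifyClient("your_apify_token")

run_input = {
    "targetUrl": "https://books.toscrape.com/",
    "userGoal": "Get me a list of all the books on the first page.",
    "claudeApiKey": "sk-ant-...",
}

# Start the actor and wait for the run to finish.
run = client.actor("your-username/ScraperCodeGenerator").call(run_input=run_input)

# Read the extracted items from the run's default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)
```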

.actor/actor.json

{
"actorSpecification": 1,
"name": "ScraperCodeGenerator",
"version": "0.1",
"buildTag": "latest",
"environmentVariables": {
"CLAUDE_API_KEY": "@claudeApiKey"
},
"dockerfile": "./Dockerfile",
"input": {
"title": "Intelligent Scraper Settings",
"description": "AI-powered web scraper that generates custom scraping code based on your requirements",
"type": "object",
"schemaVersion": 1,
"properties": {
"targetUrl": {
"title": "Target URL",
"description": "The URL of the website you want to scrape",
"type": "string",
"editor": "textfield",
"prefill": "https://books.toscrape.com/"
},
"userGoal": {
"title": "Scraping Goal",
"description": "Describe what data you want to extract from the website",
"type": "string",
"editor": "textarea",
"prefill": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock."
},
"claudeApiKey": {
"title": "Claude API Key",
"description": "Your Anthropic Claude API key for AI-powered code generation",
"type": "string",
"editor": "textfield",
"isSecret": true
},
"maxRetries": {
"title": "Max Retries",
"description": "Maximum number of retry attempts for scraping",
"type": "integer",
"minimum": 1,
"maximum": 10,
"default": 3
},
"timeout": {
"title": "Timeout (seconds)",
"description": "Timeout for each scraping attempt in seconds",
"type": "integer",
"minimum": 10,
"maximum": 300,
"default": 60
}
},
"required": ["targetUrl", "userGoal", "claudeApiKey"]
},
"storages": {
"dataset": {
"actorSpecification": 1,
"views": {
"scraped_data": {
"title": "Scraped Data",
"transformation": {},
"display": {
"component": "table",
"properties": {
"url": {
"label": "Source URL",
"format": "link"
},
"title": {
"label": "Page Title",
"format": "text"
},
"data": {
"label": "Extracted Data",
"format": "object"
},
"timestamp": {
"label": "Scraped At",
"format": "datetime"
},
"success": {
"label": "Success",
"format": "boolean"
},
"error": {
"label": "Error Message",
"format": "text"
}
}
}
}
}
}
}
}

quality_eval/claude_client.py

1

quality_eval/html_quality_evaluator.py

1"""
2HTML Quality Evaluator Module
3
4This module provides functionality to evaluate the quality of HTML documents
5fetched by automated browsers. It determines if the HTML is suitable for data
6extraction or if it represents a failure page (e.g., CAPTCHA, block page, etc.).
7
8Uses Claude AI to perform intelligent analysis based on the user's extraction goals.
9"""
10
11import json
12import logging
13import re
14from typing import Dict, Any, Optional
15from dataclasses import dataclass
16
17import anthropic
18
19
20@dataclass
21class EvaluationResult:
22 """Result of HTML quality evaluation."""
23 score: int # 1-10 scale
24 reasoning: str
25
26
27@dataclass
28class PreEvaluationResult:
29 """Result of pre-evaluation checks before sending to Claude."""
30 is_valid_html: bool
31 score: Optional[int] = None # If we can determine score without Claude
32 reasoning: Optional[str] = None
33 should_continue_to_claude: bool = True
34
35
36class HTMLQualityEvaluator:
37 """
38 Evaluates HTML quality for web scraping using Claude AI.
39
40 This class provides methods to analyze HTML content and determine
41 if it's suitable for data extraction based on the user's goals.
42 """
43
44 def __init__(self, claude_api_key: str):
45 """
46 Initialize the HTML Quality Evaluator.
47
48 Args:
49 claude_api_key: Anthropic API key for Claude access
50 """
51 if not claude_api_key or not claude_api_key.strip():
52 raise ValueError("claude_api_key cannot be empty")
53
54 self.client = anthropic.Anthropic(api_key=claude_api_key)
55 self.logger = logging.getLogger(__name__)
56
57 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:
58 """
59 Perform basic validation checks on HTML content before sending to Claude.
60
61 This method checks for obvious issues that don't require AI analysis:
62 - Whether content is actually HTML
63 - Empty or whitespace-only content
64
65 Args:
66 html_content: The HTML content to pre-evaluate
67
68 Returns:
69 PreEvaluationResult indicating if validation passed and whether to continue
70 """
71 if not html_content or not html_content.strip():
72 return PreEvaluationResult(
73 is_valid_html=False,
74 score=1,
75 reasoning="Content is empty or contains only whitespace",
76 should_continue_to_claude=False
77 )
78
79 content = html_content.strip()
80 content_lower = content.lower()
81
82 # Check if content looks like HTML at all
83 has_opening_closing_tags = '<' in content and '>' in content
84
85 if not has_opening_closing_tags:
86 return PreEvaluationResult(
87 is_valid_html=False,
88 score=1,
89 reasoning="Content does not contain HTML tags",
90 should_continue_to_claude=False
91 )
92
93 # Check for obvious non-HTML content
94 if content.startswith('{') and content.endswith('}'):
95 # Looks like JSON
96 return PreEvaluationResult(
97 is_valid_html=False,
98 score=1,
99 reasoning="Content appears to be JSON, not HTML",
100 should_continue_to_claude=False
101 )
102
103 if content.startswith('<?xml'):
104 # XML but not HTML
105 return PreEvaluationResult(
106 is_valid_html=False,
107 score=1,
108 reasoning="Content appears to be XML, not HTML",
109 should_continue_to_claude=False
110 )
111
112 # If it looks like valid HTML, let Claude evaluate it
113 return PreEvaluationResult(
114 is_valid_html=True,
115 should_continue_to_claude=True
116 )
117
118 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:
119 """
120 Create the prompt for Claude to evaluate HTML quality.
121
122 Args:
123 user_goal: The user's data extraction goal
124 html_content: The HTML content to evaluate
125
126 Returns:
127 Formatted prompt for Claude
128 """
129 return f"""You are a QA automation engineer specializing in web scraping. Your task is to evaluate the quality of an HTML document that was fetched by an automated browser. The goal is to determine if the HTML is suitable for data extraction or if it looks like a failure page (e.g., a CAPTCHA, block page, or empty loading page).
130
131You will be given the user's original data extraction goal and the full HTML content from a scraping attempt.
132
133REQUIREMENTS:
134- Analysis: Analyze the provided HTML for common indicators of success or failure.
135- Success Indicators: Meaningful content in the <body>, presence of <h1>-<h6> tags, lists (<ul>, <ol>), tables (<table>), structured <div>s with descriptive class names.
136- Failure Indicators: Common keywords like "CAPTCHA", "bot detection", "enable JavaScript", "access denied", "rate limit". An empty or near-empty <body> tag. A structure that looks like a loading spinner.
137- Output Format: Your response MUST be a single, well-formed JSON object with the following two keys:
138 "score": An integer from 1 (unusable) to 10 (perfectly rendered and content-rich).
139 "reasoning": A brief, one-sentence explanation for your score.
140- Contextual Awareness: Use the "User's Goal" to inform your analysis. If the user wants "product listings" and you see a grid of items, that's a high score. If you see a login form, that's a low score.
141
142[INPUT 1] User's Goal:
143{user_goal}
144
145[INPUT 2] HTML Content from an Actor Run:
146{html_content}
147
148Please analyze the HTML and provide your evaluation as a JSON object."""
149
150 def _parse_claude_response(self, response_text: str) -> Optional[EvaluationResult]:
151 """
152 Parse Claude's response and extract the evaluation result.
153
154 Args:
155 response_text: Raw response from Claude
156
157 Returns:
158 EvaluationResult if parsing successful, None otherwise
159 """
160 try:
161 # Try to find JSON in the response
162 json_match = re.search(r'\{[^}]*"score"[^}]*\}', response_text, re.DOTALL)
163 if not json_match:
164 self.logger.error("No JSON object found in Claude's response")
165 return None
166
167 json_str = json_match.group(0)
168 result_dict = json.loads(json_str)
169
170 # Validate required fields
171 if 'score' not in result_dict or 'reasoning' not in result_dict:
172 self.logger.error("Missing required fields in Claude's response")
173 return None
174
175 score = int(result_dict['score'])
176 if not (1 <= score <= 10):
177 self.logger.error(f"Score {score} is outside valid range 1-10")
178 return None
179
180 return EvaluationResult(
181 score=score,
182 reasoning=result_dict['reasoning']
183 )
184
185 except (json.JSONDecodeError, ValueError, KeyError) as e:
186 self.logger.error(f"Failed to parse Claude's response: {e}")
187 return None
188
189 def _preprocess_html(self, html_content: str) -> str:
190 """
191 Preprocess HTML content for analysis.
192
193 Args:
194 html_content: Raw HTML content
195
196 Returns:
197 Preprocessed HTML content
198 """
199 # Truncate very long HTML to avoid token limits
200 max_length = 100000 # Adjust based on Claude's token limits
201 if len(html_content) > max_length:
202 self.logger.warning(f"HTML content truncated from {len(html_content)} to {max_length} characters")
203 html_content = html_content[:max_length] + "\n<!-- Content truncated for analysis -->"
204
205 return html_content
206
207 def get_pre_evaluation_info(self, html_content: str) -> PreEvaluationResult:
208 """
209 Get detailed pre-evaluation information without performing the full evaluation.
210
211 This method allows users to see what the pre-evaluation checks detected
212 without calling Claude if they just want to understand the basic validation.
213
214 Args:
215 html_content: The HTML content to pre-evaluate
216
217 Returns:
218 PreEvaluationResult with detailed validation information
219 """
220 return self._pre_evaluate_html(html_content)
221
222 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
223 """
224 Evaluate the quality of HTML content for data extraction.
225
226 This is the main function that analyzes HTML content to determine if it's
227 suitable for data extraction based on the user's goals.
228
229 The evaluation process consists of two stages:
230 1. Pre-evaluation: Basic validation checks for empty or obviously non-HTML
231 content (e.g., JSON or XML responses) that don't require AI analysis
232 2. Claude AI analysis: Detailed content evaluation using AI if pre-evaluation passes
233
234 Args:
235 user_goal: The user's data extraction goal/objective
236 html_content: The HTML content to evaluate
237
238 Returns:
239 EvaluationResult containing score (1-10) and reasoning, or None if evaluation failed
240
241 Example:
242 >>> evaluator = HTMLQualityEvaluator("your-claude-api-key")
243 >>> result = evaluator.evaluate_html_quality(
244 ... "I want to get a list of all articles on the homepage",
245 ... "<html><body><article>...</article></body></html>"
246 ... )
247 >>> if result:
248 ... print(f"Score: {result.score}, Reasoning: {result.reasoning}")
249 """
250 if not user_goal or not user_goal.strip():
251 raise ValueError("user_goal cannot be empty")
252
253 # Perform pre-evaluation checks on the HTML content
254 self.logger.info("Starting pre-evaluation checks on HTML content")
255 pre_eval_result = self._pre_evaluate_html(html_content)
256
257 if not pre_eval_result.should_continue_to_claude:
258 self.logger.info(f"Pre-evaluation completed without Claude - Score: {pre_eval_result.score}")
259 return EvaluationResult(
260 score=pre_eval_result.score,
261 reasoning=pre_eval_result.reasoning
262 )
263
264 # Log pre-evaluation results
265 if not pre_eval_result.is_valid_html:
266 self.logger.warning("Pre-evaluation detected non-HTML content, but continuing to Claude")
267 else:
268 self.logger.info("Pre-evaluation passed, proceeding to Claude analysis")
269
270 try:
271 # Preprocess HTML content
272 processed_html = self._preprocess_html(html_content)
273
274 # Create the evaluation prompt
275 prompt = self._create_evaluation_prompt(user_goal, processed_html)
276
277 self.logger.info("Sending HTML evaluation request to Claude")
278
279 # Send request to Claude
280 response = self.client.messages.create(
281 model="claude-3-5-sonnet-20241022", # TODO use 4
282 max_tokens=1000,
283 messages=[
284 {
285 "role": "user",
286 "content": prompt
287 }
288 ]
289 )
290
291 # Extract and parse the response
292 response_text = response.content[0].text
293 result = self._parse_claude_response(response_text)
294
295 if result:
296 self.logger.info(f"HTML evaluation completed - Score: {result.score}")
297 else:
298 self.logger.error("Failed to parse evaluation result from Claude")
299
300 return result
301
302 except anthropic.APIError as e:
303 self.logger.error(f"Claude API error: {e}")
304 return None
305 except Exception as e:
306 self.logger.error(f"Unexpected error during HTML evaluation: {e}", exc_info=True)
307 return None
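
A minimal usage sketch for the evaluator above. The import path and the `CLAUDE_API_KEY` environment variable are assumptions about how the module is packaged and configured:

```python
import os

from quality_eval.html_quality_evaluator import HTMLQualityEvaluator

evaluator = HTMLQualityEvaluator(os.environ["CLAUDE_API_KEY"])

# Non-HTML input is caught by the pre-evaluation checks and scored 1
# without ever calling Claude.
result = evaluator.evaluate_html_quality(
    "List all article headlines",
    '{"error": "rate limited"}',
)
print(result.score, result.reasoning)  # score is 1, no Claude call was made

# Real HTML is sent to Claude and scored on the 1-10 scale; None is returned
# if the API call fails or the response cannot be parsed.
result = evaluator.evaluate_html_quality(
    "List all article headlines",
    "<html><body><h1>News</h1><ul><li>Headline A</li></ul></body></html>",
)
if result:
    print(result.score, result.reasoning)
```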

manual_scraping/apify_runner.py

1"""
2Apify Actor Runner Module
3
4This module provides functionality to run Apify actors and retrieve their results
5using the official Apify Python client library.
6"""
7
8import logging
9from typing import Optional, Dict, Any, List, Union
10
11from apify_client import ApifyClient
12
13
14def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:
15 """
16 Run an Apify actor and retrieve its dataset results.
17
18 This function is maintained for backward compatibility. For more flexible
19 data retrieval (including key-value stores), use run_apify_actor_with_flexible_retrieval.
20
21 Args:
22 actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')
23 actor_input: Dictionary containing the input configuration for the actor
24 api_token: Apify API token for authentication (keyword-only argument)
25
26 Returns:
27 List of dictionaries containing the dataset items if successful, None otherwise.
28 Returns an empty list if the run succeeds but produces no data.
29
30 Raises:
31 ValueError: If actor_id or api_token is empty
32
33 Example:
34 >>> input_data = {"startUrls": [{"url": "https://example.com"}]}
35 >>> results = run_apify_actor("apify/web-scraper", input_data, api_token="your_token")
36 >>> if results is not None:
37 ... print(f"Scraped {len(results)} items")
38 """
39 result = run_apify_actor_with_flexible_retrieval(
40 actor_id=actor_id,
41 actor_input=actor_input,
42 api_token=api_token,
43 retrieve_from="dataset"
44 )
45
46 # Ensure we return a list for backward compatibility
47 if isinstance(result, list):
48 return result
49 else:
50 return None
51
52
53def run_apify_actor_with_flexible_retrieval(
54 actor_id: str,
55 actor_input: dict,
56 *,
57 api_token: str,
58 retrieve_from: str = "auto"
59) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
60 """
61 Run an Apify actor and retrieve its results from dataset or key-value store.
62
63 This function starts an Apify actor with the provided input, waits for the run
64 to complete, and returns data from either the dataset or key-value store based
65 on the actor type or explicit configuration.
66
67 Args:
68 actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')
69 actor_input: Dictionary containing the input configuration for the actor
70 api_token: Apify API token for authentication (keyword-only argument)
71 retrieve_from: Where to retrieve data from. Options:
72 - "auto": Automatically detect based on actor_id
73 - "dataset": Retrieve from dataset only
74 - "key-value-store": Retrieve from key-value store only
75 - "both": Retrieve from both and return combined results
76
77 Returns:
78 - If retrieve_from is "dataset": List of dictionaries from dataset
79 - If retrieve_from is "key-value-store": Dictionary with key-value store items
80 - If retrieve_from is "both": Dictionary with 'dataset' and 'key_value_store' keys
81 - If retrieve_from is "auto": Appropriate format based on actor type
82 Returns None if the run fails.
83
84 Raises:
85 ValueError: If actor_id, api_token is empty, or retrieve_from is invalid
86
87 Example:
88 >>> input_data = {"startUrls": [{"url": "https://example.com"}]}
89 >>> # Auto-detect storage type
90 >>> results = run_apify_actor_with_flexible_retrieval(
91 ... "apify/website-content-crawler",
92 ... input_data,
93 ... api_token="your_token"
94 ... )
95 >>> # Explicitly use key-value store
96 >>> results = run_apify_actor_with_flexible_retrieval(
97 ... "apify/web-scraper",
98 ... input_data,
99 ... api_token="your_token",
100 ... retrieve_from="key-value-store"
101 ... )
102 """
103 # Input validation
104 if not actor_id or not actor_id.strip():
105 raise ValueError("actor_id cannot be empty")
106 if not api_token or not api_token.strip():
107 raise ValueError("api_token cannot be empty")
108 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:
109 raise ValueError("retrieve_from must be 'auto', 'dataset', 'key-value-store', or 'both'")
110
111 # Configure logging
112 logger = logging.getLogger(__name__)
113
114 # Determine storage type based on actor if auto mode
115 if retrieve_from == "auto":
116 if "website-content-crawler" in actor_id:
117 retrieve_from = "key-value-store"
118 else:
119 retrieve_from = "dataset"
120
121 try:
122 # Initialize the Apify client
123 client = ApifyClient(api_token)
124
125 logger.info(f"Starting Apify actor: {actor_id}")
126
127 # Start the actor run
128 run = client.actor(actor_id).call(run_input=actor_input)
129
130 # Check if the run was created successfully
131 if not run:
132 logger.error(f"Failed to start actor run for {actor_id}")
133 return None
134
135 run_id = run.get('id')
136 status = run.get('status')
137
138 logger.info(f"Actor run started with ID: {run_id}, Status: {status}")
139
140 # Check the final status of the run
141 if status != 'SUCCEEDED':
142 logger.warning(
143 f"Actor run {run_id} did not succeed. "
144 f"Final status: {status}. "
145 f"Exit code: {run.get('exitCode', 'N/A')}"
146 )
147 return None
148
149 logger.info(f"Actor run {run_id} completed successfully")
150
151 # Retrieve data based on the specified method
152 if retrieve_from == "dataset":
153 dataset_client = client.dataset(run.get('defaultDatasetId'))
154 dataset_items = list(dataset_client.iterate_items())
155 logger.info(f"Retrieved {len(dataset_items)} items from dataset")
156 return dataset_items
157
158 elif retrieve_from == "key-value-store":
159 kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))
160
161 # List all keys in the key-value store
162 keys_response = kv_store_client.list_keys()
163 keys = [item['key'] for item in keys_response.get('items', [])]
164
165 if not keys:
166 logger.warning("No keys found in key-value store")
167 return {}
168
169 # Retrieve all key-value pairs
170 kv_items = {}
171 for key in keys:
172 try:
173 value = kv_store_client.get_record(key)
174 if value:
175 kv_items[key] = value.get('value')
176 except Exception as e:
177 logger.warning(f"Failed to retrieve key '{key}': {str(e)}")
178
179 logger.info(f"Retrieved {len(kv_items)} items from key-value store")
180 return kv_items
181
182 elif retrieve_from == "both":
183 # Retrieve from both sources
184 results = {"dataset": [], "key_value_store": {}}
185
186 # Get dataset items
187 try:
188 dataset_client = client.dataset(run.get('defaultDatasetId'))
189 dataset_items = list(dataset_client.iterate_items())
190 results["dataset"] = dataset_items
191 logger.info(f"Retrieved {len(dataset_items)} items from dataset")
192 except Exception as e:
193 logger.warning(f"Failed to retrieve dataset items: {str(e)}")
194
195 # Get key-value store items
196 try:
197 kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))
198 keys_response = kv_store_client.list_keys()
199 keys = [item['key'] for item in keys_response.get('items', [])]
200
201 kv_items = {}
202 for key in keys:
203 try:
204 value = kv_store_client.get_record(key)
205 if value:
206 kv_items[key] = value.get('value')
207 except Exception as e:
208 logger.warning(f"Failed to retrieve key '{key}': {str(e)}")
209
210 results["key_value_store"] = kv_items
211 logger.info(f"Retrieved {len(kv_items)} items from key-value store")
212 except Exception as e:
213 logger.warning(f"Failed to retrieve key-value store items: {str(e)}")
214
215 return results
216
217 except Exception as e:
218 logger.error(f"Error running Apify actor {actor_id}: {str(e)}", exc_info=True)
219 return None
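
As a rough usage sketch of the retrieval modes described above (the import path is an assumption about how `manual_scraping` is packaged):

```python
from manual_scraping.apify_runner import run_apify_actor_with_flexible_retrieval

# "both" returns a dict with the dataset items and the key-value store records.
results = run_apify_actor_with_flexible_retrieval(
    "apify/website-content-crawler",
    {"startUrls": [{"url": "https://example.com", "method": "GET"}]},
    api_token="your_apify_token",
    retrieve_from="both",
)

if results is None:
    print("Actor run failed")
else:
    print(f"{len(results['dataset'])} dataset items")
    print(f"{len(results['key_value_store'])} key-value store records")
```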

manual_scraping/multi_actor_scraper.py

1"""
2Multi-Actor Website Scraper
3
4This module provides functionality to scrape websites using multiple Apify actors
5simultaneously and return a dictionary of actor-scraped HTML pairs.
6"""
7
8import asyncio
9import logging
10from typing import Dict, List, Optional, Any
11from concurrent.futures import ThreadPoolExecutor, as_completed
12import json
13
14from .apify_runner import run_apify_actor_with_flexible_retrieval, run_apify_actor
15
16
17class MultiActorScraper:
18 """
19 A class to scrape websites using multiple Apify actors simultaneously.
20 """
21
22 def __init__(self, api_token: str):
23 """
24 Initialize the MultiActorScraper.
25
26 Args:
27 api_token: Apify API token for authentication
28 """
29 self.api_token = api_token
30 self.logger = logging.getLogger(__name__)
31
32 # Configure logging
33 logging.basicConfig(
34 level=logging.INFO,
35 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
36 )
37
38 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:
39 """
40 Get the configuration for each actor with the target URL.
41
42 Args:
43 target_url: The URL to scrape
44
45 Returns:
46 Dictionary mapping actor names to their configurations
47 """
48 return {
49 "cheerio-scraper": {
50 "actor_id": "apify/cheerio-scraper",
51 "input": {
52 "debugLog": False,
53 "forceResponseEncoding": False,
54 "ignoreSslErrors": False,
55 "keepUrlFragments": False,
56 "pageFunction": """async function pageFunction(context) {
57 const { $, request, log } = context;
58
59 const url = request.url;
60 const html = $.html(); // Get the full HTML of the page
61
62 log.info('Page scraped', { url });
63
64 return {
65 url,
66 html
67 };
68}""",
69 "postNavigationHooks": """// We need to return array of (possibly async) functions here.
70// The functions accept a single argument: the "crawlingContext" object.
71[
72 async (crawlingContext) => {
73 // ...
74 },
75]""",
76 "preNavigationHooks": """// We need to return array of (possibly async) functions here.
77// The functions accept two arguments: the "crawlingContext" object
78// and "requestAsBrowserOptions" which are passed to the `requestAsBrowser()`
79// function the crawler calls to navigate..
80[
81 async (crawlingContext, requestAsBrowserOptions) => {
82 // ...
83 }
84]""",
85 "proxyConfiguration": {
86 "useApifyProxy": True
87 },
88 "respectRobotsTxtFile": False,
89 "startUrls": [
90 {
91 "url": target_url,
92 "method": "GET"
93 }
94 ]
95 }
96 },
97
98 "website-content-crawler": {
99 "actor_id": "apify/website-content-crawler",
100 "input": {
101 "aggressivePrune": False,
102 "clickElementsCssSelector": "[aria-expanded=\"false\"]",
103 "clientSideMinChangePercentage": 15,
104 "crawlerType": "playwright:adaptive",
105 "debugLog": False,
106 "debugMode": False,
107 "dynamicContentWaitSecs": 15,
108 "expandIframes": True,
109 "ignoreCanonicalUrl": False,
110 "keepUrlFragments": False,
111 "maxCrawlDepth": 0,
112 "proxyConfiguration": {
113 "useApifyProxy": True,
114 "apifyProxyGroups": [
115 "RESIDENTIAL"
116 ]
117 },
118 "readableTextCharThreshold": 100,
119 "removeCookieWarnings": True,
120 "renderingTypeDetectionPercentage": 10,
121 "respectRobotsTxtFile": False,
122 "saveFiles": False,
123 "saveHtml": False,
124 "saveHtmlAsFile": True,
125 "saveMarkdown": False,
126 "saveScreenshots": False,
127 "startUrls": [
128 {
129 "url": target_url,
130 "method": "GET"
131 }
132 ],
133 "useSitemaps": False
134 }
135 },
136
137 "web-scraper": {
138 "actor_id": "apify/web-scraper",
139 "input": {
140 "breakpointLocation": "NONE",
141 "browserLog": False,
142 "closeCookieModals": False,
143 "debugLog": False,
144 "downloadCss": True,
145 "downloadMedia": True,
146 "headless": True,
147 "ignoreCorsAndCsp": False,
148 "ignoreSslErrors": False,
149 "injectJQuery": True,
150 "keepUrlFragments": False,
151 "pageFunction": """// The function accepts a single argument: the "context" object.
152// For a complete list of its properties and functions,
153// see https://apify.com/apify/web-scraper#page-function
154async function pageFunction(context) {
155 // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
156 // debugger;
157
158 // jQuery is handy for finding DOM elements and extracting data from them.
159 // To use it, make sure to enable the "Inject jQuery" option.
160 const $ = context.jQuery;
161 const url = context.request.url;
162 const html = $('html').html(); // Get the full HTML of the page
163
164 // Print some information to Actor log
165 context.log.info(`URL: ${url}, HTML length: ${html ? html.length : 0}`);
166
167 // Return an object with the data extracted from the page.
168 // It will be stored to the resulting dataset.
169 return {
170 url,
171 html
172 };
173}""",
174 "postNavigationHooks": """// We need to return array of (possibly async) functions here.
175// The functions accept a single argument: the "crawlingContext" object.
176[
177 async (crawlingContext) => {
178 // ...
179 },
180]""",
181 "preNavigationHooks": """// We need to return array of (possibly async) functions here.
182// The functions accept two arguments: the "crawlingContext" object
183// and "gotoOptions".
184[
185 async (crawlingContext, gotoOptions) => {
186 // ...
187 },
188]""",
189 "proxyConfiguration": {
190 "useApifyProxy": True
191 },
192 "respectRobotsTxtFile": False,
193 "runMode": "PRODUCTION",
194 "startUrls": [
195 {
196 "url": target_url,
197 "method": "GET"
198 }
199 ],
200 "useChrome": False,
201 "waitUntil": [
202 "networkidle2"
203 ]
204 }
205 }
206 }
207
208 def _run_single_actor(self, actor_name: str, actor_config: Dict[str, Any]) -> tuple[str, Optional[str]]:
209 """
210 Run a single actor and extract HTML content.
211
212 Args:
213 actor_name: Name of the actor
214 actor_config: Configuration for the actor
215
216 Returns:
217 Tuple of (actor_name, html_content or None)
218 """
219 try:
220 self.logger.info(f"Starting {actor_name}...")
221
222 # Determine retrieval method based on actor type
223 if "website-content-crawler" in actor_config["actor_id"]:
224 retrieve_from = "key-value-store"
225 else:
226 retrieve_from = "dataset"
227
228 results = run_apify_actor_with_flexible_retrieval(
229 actor_config["actor_id"],
230 actor_config["input"],
231 api_token=self.api_token,
232 retrieve_from=retrieve_from
233 )
234
235 if not results:
236 self.logger.warning(f"{actor_name} returned no results")
237 return actor_name, None
238
239 # Extract HTML from results
240 html_content = None
241
242 # Handle different result formats
243 if isinstance(results, list):
244 # Dataset results (list format)
245 for item in results:
246 if isinstance(item, dict):
247 # For cheerio-scraper and web-scraper
248 if 'html' in item:
249 html_content = item['html']
250 break
251 # For other actors that might use different keys
252 elif 'text' in item:
253 html_content = item['text']
254 break
255 elif 'content' in item:
256 html_content = item['content']
257 break
258 # If it's a string (sometimes the whole result is HTML)
259 elif isinstance(item, str):
260 html_content = item
261 break
262
263 elif isinstance(results, dict):
264 # Key-value store results (dict format) - for website-content-crawler
265 # Look for HTML files in the key-value store
266 for key, value in results.items():
267 if key.endswith('.html') or 'html' in key.lower():
268 if isinstance(value, str):
269 html_content = value
270 break
271 elif isinstance(value, dict) and 'value' in value:
272 html_content = value['value']
273 break
274
275 # If no HTML file found, look for other common keys
276 if not html_content:
277 for key, value in results.items():
278 if isinstance(value, str) and len(value) > 100: # Likely HTML content
279 html_content = value
280 break
281 elif isinstance(value, dict):
282 # Check if the value dict contains HTML
283 if 'html' in value:
284 html_content = value['html']
285 break
286 elif 'content' in value:
287 html_content = value['content']
288 break
289
290 if html_content:
291 self.logger.info(f"{actor_name} completed successfully - HTML length: {len(html_content)}")
292 else:
293 self.logger.warning(f"{actor_name} completed but no HTML content found in results")
294 # Log the structure of the first result for debugging
295 if isinstance(results, list) and results:
296 self.logger.debug(f"{actor_name} result structure: {list(results[0].keys()) if isinstance(results[0], dict) else type(results[0])}")
297 elif isinstance(results, dict):
298 self.logger.debug(f"{actor_name} key-value store keys: {list(results.keys())}")
299
300 return actor_name, html_content
301
302 except Exception as e:
303 self.logger.error(f"Error running {actor_name}: {str(e)}", exc_info=True)
304 return actor_name, None
305
306 def scrape_with_multiple_actors(self, target_url: str, max_workers: int = 4) -> Dict[str, Optional[str]]:
307 """
308 Scrape a website using multiple Apify actors simultaneously.
309
310 Args:
311 target_url: The URL to scrape
312 max_workers: Maximum number of concurrent workers
313
314 Returns:
315 Dictionary mapping actor names to their scraped HTML content
316 """
317 self.logger.info(f"Starting multi-actor scraping for URL: {target_url}")
318
319 actor_configs = self._get_actor_configs(target_url)
320 results = {}
321
322 with ThreadPoolExecutor(max_workers=max_workers) as executor:
323 # Submit all actor runs
324 future_to_actor = {
325 executor.submit(self._run_single_actor, actor_name, config): actor_name
326 for actor_name, config in actor_configs.items()
327 }
328
329 # Collect results as they complete
330 for future in as_completed(future_to_actor):
331 actor_name = future_to_actor[future]
332 try:
333 actor_name_result, html_content = future.result()
334 results[actor_name_result] = html_content
335
336 except Exception as e:
337 self.logger.error(f"Error getting result for {actor_name}: {str(e)}")
338 results[actor_name] = None
339
340 # Log summary
341 successful_actors = [name for name, content in results.items() if content is not None]
342 failed_actors = [name for name, content in results.items() if content is None]
343
344 self.logger.info(f"Scraping completed!")
345 self.logger.info(f"Successful actors: {successful_actors}")
346 if failed_actors:
347 self.logger.warning(f"Failed actors: {failed_actors}")
348
349 return results
350
351 def save_results_to_files(self, results: Dict[str, Optional[str]], output_dir: str = "scraped_results") -> None:
352 """
353 Save the scraped HTML results to separate files.
354
355 Args:
356 results: Dictionary of actor-HTML pairs
357 output_dir: Directory to save the files
358 """
359 import os
360
361 if not os.path.exists(output_dir):
362 os.makedirs(output_dir)
363
364 for actor_name, html_content in results.items():
365 if html_content:
366 filename = f"{actor_name.replace('/', '_')}_result.html"
367 filepath = os.path.join(output_dir, filename)
368
369 with open(filepath, 'w', encoding='utf-8') as f:
370 f.write(html_content)
371
372 self.logger.info(f"Saved {actor_name} result to {filepath}")
373 else:
374 self.logger.warning(f"No content to save for {actor_name}")
375
376def scrape_with_single_actor_flexible(target_url: str, actor_id: str, api_token: str,
377 custom_input: Optional[Dict[str, Any]] = None) -> Optional[str]:
378 """
379 Convenience function to scrape a website using a single Apify actor with flexible retrieval.
380
381 This function automatically detects whether to use dataset or key-value store based on the actor type.
382
383 Args:
384 target_url: The URL to scrape
385 actor_id: The Apify actor ID (e.g., 'apify/website-content-crawler')
386 api_token: Apify API token
387 custom_input: Optional custom input configuration for the actor
388
389 Returns:
390 HTML content as string if successful, None otherwise
391
392 Example:
393 >>> # For website-content-crawler (uses key-value store)
394 >>> html = scrape_with_single_actor_flexible(
395 ... "https://example.com",
396 ... "apify/website-content-crawler",
397 ... api_token
398 ... )
399 >>> # For web-scraper (uses dataset)
400 >>> html = scrape_with_single_actor_flexible(
401 ... "https://example.com",
402 ... "apify/web-scraper",
403 ... api_token
404 ... )
405 """
406 logger = logging.getLogger(__name__)
407
408 try:
409 # Use default input if none provided
410 if custom_input is None:
411 if "website-content-crawler" in actor_id:
412 actor_input = {
413 "aggressivePrune": False,
414 "clickElementsCssSelector": "[aria-expanded=\"false\"]",
415 "clientSideMinChangePercentage": 15,
416 "crawlerType": "playwright:adaptive",
417 "debugLog": False,
418 "debugMode": False,
419 "dynamicContentWaitSecs": 15,
420 "expandIframes": True,
421 "ignoreCanonicalUrl": False,
422 "keepUrlFragments": False,
423 "proxyConfiguration": {
424 "useApifyProxy": True,
425 "apifyProxyGroups": ["RESIDENTIAL"]
426 },
427 "readableTextCharThreshold": 100,
428 "removeCookieWarnings": True,
429 "renderingTypeDetectionPercentage": 10,
430 "respectRobotsTxtFile": False,
431 "saveFiles": False,
432 "saveHtml": False,
433 "saveHtmlAsFile": True,
434 "saveMarkdown": False,
435 "saveScreenshots": False,
436 "startUrls": [{"url": target_url, "method": "GET"}],
437 "useSitemaps": False
438 }
439 else:
440 # Default input for other actors
441 actor_input = {
442 "startUrls": [{"url": target_url, "method": "GET"}],
443 "proxyConfiguration": {"useApifyProxy": True}
444 }
445 else:
446 actor_input = custom_input
447
448 # Determine retrieval method based on actor type
449 if "website-content-crawler" in actor_id:
450 retrieve_from = "key-value-store"
451 else:
452 retrieve_from = "dataset"
453
454 # Run the actor with flexible retrieval
455 results = run_apify_actor_with_flexible_retrieval(
456 actor_id=actor_id,
457 actor_input=actor_input,
458 api_token=api_token,
459 retrieve_from=retrieve_from
460 )
461
462 if not results:
463 logger.warning(f"No results returned from {actor_id}")
464 return None
465
466 # Extract HTML content
467 html_content = None
468
469 if isinstance(results, list):
470 # Dataset results
471 for item in results:
472 if isinstance(item, dict) and 'html' in item:
473 html_content = item['html']
474 break
475 elif isinstance(results, dict):
476 # Key-value store results
477 for key, value in results.items():
478 if key.endswith('.html') or 'html' in key.lower():
479 if isinstance(value, str):
480 html_content = value
481 break
482 elif isinstance(value, dict) and 'value' in value:
483 html_content = value['value']
484 break
485
486 if html_content:
487 logger.info(f"Successfully scraped {len(html_content)} characters from {target_url}")
488 else:
489 logger.warning(f"No HTML content found in results from {actor_id}")
490 logger.debug(f"Result structure: {type(results)} with keys: {list(results.keys()) if isinstance(results, dict) else 'N/A'}")
491
492 return html_content
493
494 except Exception as e:
495 logger.error(f"Error scraping with {actor_id}: {str(e)}", exc_info=True)
496 return None
497
498
499def scrape_website_with_multiple_actors(target_url: str, api_token: str) -> Dict[str, Optional[str]]:
500 """
501 Convenience function to scrape a website using multiple Apify actors.
502
503 Args:
504 target_url: The URL to scrape
505 api_token: Apify API token
506
507 Returns:
508 Dictionary mapping actor names to their scraped HTML content
509 """
510 scraper = MultiActorScraper(api_token)
511 return scraper.scrape_with_multiple_actors(target_url)
512
513
514if __name__ == "__main__":
515 # Example usage
516 import os
517
518 # Get API token from environment variable
519 api_token = os.getenv("APIFY_TOKEN")
520 if not api_token:
521 print("Please set the APIFY_TOKEN environment variable")
522 exit(1)
523
524 # Example URL to scrape
525 target_url = "https://fbref.com/en/players/3d1f29d9/matchlogs/2021-2022/Jordyn-Huitema-Match-Logs"
526
527 # Create scraper and run
528 scraper = MultiActorScraper(api_token)
529 results = scraper.scrape_with_multiple_actors(target_url)
530
531 # Print results summary
532 print("\n=== SCRAPING RESULTS ===")
533 for actor_name, html_content in results.items():
534 if html_content:
535 print(f"{actor_name}: SUCCESS (HTML length: {len(html_content)})")
536 else:
537 print(f"{actor_name}: FAILED")
538
539 # Optionally save results to files
540 scraper.save_results_to_files(results)
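
To show how the multi-actor output can feed the HTML quality evaluator, a hedged sketch of the selection step (import paths and environment variable names are assumptions; the production wiring lives in `src/llmscraper/pipeline.py`):

```python
import os

from manual_scraping.multi_actor_scraper import scrape_website_with_multiple_actors
from quality_eval.html_quality_evaluator import HTMLQualityEvaluator

user_goal = "Get every book title and price on the first page."
html_by_actor = scrape_website_with_multiple_actors(
    "https://books.toscrape.com/", os.environ["APIFY_TOKEN"]
)

# Score each actor's HTML against the goal and keep the best one.
evaluator = HTMLQualityEvaluator(os.environ["CLAUDE_API_KEY"])
scores = {}
for actor_name, html in html_by_actor.items():
    if html is None:
        continue
    evaluation = evaluator.evaluate_html_quality(user_goal, html)
    if evaluation:
        scores[actor_name] = evaluation.score

if scores:
    best_actor = max(scores, key=scores.get)
    print(f"Best HTML came from {best_actor} (score {scores[best_actor]}/10)")
```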

tests/examples.json

[
{
"name": "Use Case 1",
"page_url": "https://books.toscrape.com/",
"goal": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock."
},
{
"name": "Use Case 2",
"page_url": "https://www.theverge.com/",
"goal": "I want to scrape the main articles from The Verge homepage. For each article, get me the headline, the author's name, and the link to the full article."
},
{
"name": "Use Case 3",
"page_url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
"goal": "Get me information about the Python programming language from its Wikipedia page. I need the 'First appeared' date, the 'Stable release' version, and the official website from the infobox on the right."
},
{
"name": "Use Case 4",
"page_url": "https://www.python.org/jobs/",
"goal": "List all the jobs posted. For each job, I want the job title, the company name, the location, and the date it was posted."
},
{
"name": "Use Case 5",
"page_url": "https://quotes.toscrape.com/",
"goal": "I want a list of all quotes on this page. For each one, get the quote text itself, the name of the author, and a list of the tags associated with it."
}
]
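
A sketch of how these example cases might be driven through the pipeline, assuming the synchronous call style used in `src/llmscraper/main.py` and API keys exported as environment variables:

```python
import json
import os

from llmscraper import run_intelligent_scraper

with open("tests/examples.json", encoding="utf-8") as f:
    examples = json.load(f)

for example in examples:
    result = run_intelligent_scraper(
        target_url=example["page_url"],
        user_goal=example["goal"],
        apify_token=os.environ["APIFY_TOKEN"],
        claude_api_key=os.environ["CLAUDE_API_KEY"],
        output_path=f"{example['name'].replace(' ', '_')}_scraper.py",
        test_script=True,
    )
    status = "OK" if result.success else f"FAILED: {result.error_message}"
    print(f"{example['name']}: {status}")
```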

src/llmscraper/__init__.py

1"""
2ScraperCodeGenerator - Intelligent Web Scraping with AI
3
4A smart web scraping framework that uses multiple scraping strategies
5and AI-powered quality evaluation to extract data from websites.
6"""
7
8from .pipeline import IntelligentScraperPipeline, run_intelligent_scraper
9from .models import ScrapingResult, GoalExtractionResult
10from .main import main, extract_goal_from_prompt
11from .scraping.actor_multi_scraper import ActorMultiScraper
12
13__version__ = "0.1.0"
14__all__ = [
15 "IntelligentScraperPipeline",
16 "run_intelligent_scraper",
17 "ScrapingResult",
18 "GoalExtractionResult",
19 "main",
20 "extract_goal_from_prompt",
21 "ActorMultiScraper"
22]

src/llmscraper/main.py

1"""
2Main entry point and natural language processing for LLMScraper.
3"""
4
5import argparse
6import logging
7import re
8import sys
9from typing import Optional
10
11import anthropic
12
13from .models import ScrapingResult, GoalExtractionResult
14from .pipeline import run_intelligent_scraper
15from .utils import get_api_key, setup_logging
16
17
18def extract_goal_from_prompt(prompt: str, claude_api_key: str) -> GoalExtractionResult:
19 """
20 Extract URL and scraping goal from a natural language prompt.
21
22 Args:
23 prompt: Natural language prompt containing URL and goal
24 claude_api_key: Claude API key for processing
25
26 Returns:
27 GoalExtractionResult with extracted information
28 """
29 try:
30 client = anthropic.Anthropic(api_key=claude_api_key)
31
32 system_prompt = """You are an expert at parsing natural language requests for web scraping.
33Extract the URL and scraping goal from the user's prompt.
34
35Return your answer in this EXACT JSON format:
36{
37 "url": "extracted_url_here",
38 "goal": "clear_extraction_goal_here"
39}
40
41The goal should be specific and actionable for web scraping (e.g., "Get all product names and prices" rather than "scrape this site").
42
43Only return the JSON, no other text."""
44
45 response = client.messages.create(
46 model="claude-3-5-sonnet-20241022",
47 max_tokens=500,
48 messages=[
49 {
50 "role": "user",
51 "content": f"Extract the URL and scraping goal from this prompt: {prompt}"
52 }
53 ],
54 system=system_prompt
55 )
56
57 content = response.content[0].text
58
59 # Extract JSON from response
60 json_match = re.search(r'\{.*\}', content, re.DOTALL)
61 if not json_match:
62 return GoalExtractionResult(
63 goal="",
64 url="",
65 success=False,
66 error_message="Could not extract JSON from Claude response"
67 )
68
69 import json
70 data = json.loads(json_match.group())
71
72 url = data.get('url', '').strip()
73 goal = data.get('goal', '').strip()
74
75 if not url or not goal:
76 return GoalExtractionResult(
77 goal=goal,
78 url=url,
79 success=False,
80 error_message="Missing URL or goal in extracted data"
81 )
82
83 return GoalExtractionResult(
84 goal=goal,
85 url=url,
86 success=True
87 )
88
89 except Exception as e:
90 return GoalExtractionResult(
91 goal="",
92 url="",
93 success=False,
94 error_message=f"Error extracting goal from prompt: {str(e)}"
95 )
96
97
98def main():
99 """Main CLI entry point."""
100 parser = argparse.ArgumentParser(
101 description="LLMScraper - Intelligent Web Scraping with AI",
102 formatter_class=argparse.RawDescriptionHelpFormatter,
103 epilog="""
104Examples:
105 # Direct URL and goal
106 llmscraper --url https://example.com --goal "Get all product names and prices"
107
108 # Natural language prompt
109 llmscraper --prompt "I want to scrape all book titles and prices from books.toscrape.com"
110
111 # Actor mode (for Apify)
112 llmscraper --url https://example.com --goal "Extract news headlines" --actor
113
114 # Test the generated script
115 llmscraper --url https://example.com --goal "Get product info" --test
116 """
117 )
118
119 # Input options
120 input_group = parser.add_mutually_exclusive_group(required=True)
121 input_group.add_argument(
122 '--prompt',
123 help='Natural language prompt with URL and extraction goal'
124 )
125 input_group.add_argument(
126 '--url',
127 help='Target URL to scrape (use with --goal)'
128 )
129
130 parser.add_argument(
131 '--goal',
132 help='Extraction goal (required when using --url)'
133 )
134
135 # API keys
136 parser.add_argument(
137 '--apify-token',
138 help='Apify API token (or set APIFY_TOKEN env var)'
139 )
140 parser.add_argument(
141 '--claude-key',
142 help='Claude API key (or set CLAUDE_API_KEY env var)'
143 )
144
145 # Output options
146 parser.add_argument(
147 '--output', '-o',
148 default='generated_scraper.py',
149 help='Output file path for generated script (default: generated_scraper.py)'
150 )
151 parser.add_argument(
152 '--actor',
153 action='store_true',
154 help='Generate script for Apify actor format'
155 )
156
157 # Execution options
158 parser.add_argument(
159 '--test',
160 action='store_true',
161 help='Test the generated script before saving'
162 )
163 parser.add_argument(
164 '--no-prune-eval',
165 action='store_true',
166 help='Disable HTML pruning before quality evaluation'
167 )
168
169 # Logging
170 parser.add_argument(
171 '--verbose', '-v',
172 action='store_true',
173 help='Enable verbose logging'
174 )
175 parser.add_argument(
176 '--quiet', '-q',
177 action='store_true',
178 help='Suppress non-error output'
179 )
180
181 args = parser.parse_args()
182
183 # Setup logging
184 if args.quiet:
185 log_level = "ERROR"
186 elif args.verbose:
187 log_level = "DEBUG"
188 else:
189 log_level = "INFO"
190
191 setup_logging(log_level)
192 logger = logging.getLogger(__name__)
193
194 # Validate arguments
195 if args.url and not args.goal:
196 logger.error("--goal is required when using --url")
197 return 1
198
199 # Get API keys
200 apify_token = get_api_key("APIFY_TOKEN", args.apify_token)
201 claude_key = get_api_key("CLAUDE_API_KEY", args.claude_key)
202
203 if not apify_token:
204 logger.error("APIFY_TOKEN is required (set environment variable or use --apify-token)")
205 return 1
206
207 if not claude_key:
208 logger.error("CLAUDE_API_KEY is required (set environment variable or use --claude-key)")
209 return 1
210
211 # Process input
212 if args.prompt:
213 logger.info("🔍 Extracting URL and goal from natural language prompt...")
214 extraction_result = extract_goal_from_prompt(args.prompt, claude_key)
215
216 if not extraction_result.success:
217 logger.error(f"Failed to extract goal from prompt: {extraction_result.error_message}")
218 return 1
219
220 target_url = extraction_result.url
221 user_goal = extraction_result.goal
222
223 logger.info(f"📍 Extracted URL: {target_url}")
224 logger.info(f"🎯 Extracted Goal: {user_goal}")
225
226 else: # args.url and args.goal
227 target_url = args.url
228 user_goal = args.goal
229
230 # Run the pipeline
231 logger.info("🚀 Starting intelligent scraper pipeline...")
232
233 result = run_intelligent_scraper(
234 target_url=target_url,
235 user_goal=user_goal,
236 apify_token=apify_token,
237 claude_api_key=claude_key,
238 output_path=None if args.actor else args.output,
239 prune_before_evaluation=not args.no_prune_eval,
240 test_script=args.test,
241 for_actor=args.actor
242 )
243
244 # Handle results
245 if result.success:
246 logger.info("✅ SUCCESS! Pipeline completed successfully!")
247 logger.info(f"🏆 Best actor: {result.best_actor}")
248
249 if not args.actor:
250 logger.info(f"📁 Generated script: {args.output}")
251
252 if result.quality_scores:
253 logger.info("📊 Quality scores:")
254 for actor, score in result.quality_scores.items():
255 emoji = "🟢" if score >= 7 else "🟡" if score >= 4 else "🔴"
256 logger.info(f" {emoji} {actor}: {score}/10")
257
258 if args.test and result.extracted_data:
259 data_count = len(result.extracted_data) if isinstance(result.extracted_data, list) else 1
260 logger.info(f"🧪 Test extraction successful: {data_count} items")
261
262 if args.actor:
263 print("\n" + "="*60)
264 print("📄 GENERATED ACTOR SCRIPT")
265 print("="*60)
266 print(result.generated_script)
267 else:
268 logger.info("\n🚀 Next steps:")
269 logger.info(f" 1. Review the generated script: {args.output}")
270 logger.info(f" 2. Run the scraper: python {args.output}")
271 logger.info(" 3. Check the extracted JSON data")
272
273 return 0
274
275 else:
276 logger.error(f"❌ FAILED: {result.error_message}")
277 if result.quality_scores:
278 logger.info("📊 Quality scores achieved:")
279 for actor, score in result.quality_scores.items():
280 emoji = "🟢" if score >= 7 else "🟡" if score >= 4 else "🔴"
281 logger.info(f" {emoji} {actor}: {score}/10")
282 return 1
283
284
285if __name__ == "__main__":
286 sys.exit(main())

src/llmscraper/models.py

1"""
2Data models for the ScraperCodeGenerator pipeline.
3"""
4
5from dataclasses import dataclass
6from typing import Dict, Any, Optional, List
7
8
9@dataclass
10class ScrapingResult:
11 """Result of the complete scraping pipeline."""
12 success: bool
13 generated_script: Optional[str] = None
14 best_actor: Optional[str] = None
15 schema: Optional[Dict[str, Any]] = None
16 error_message: Optional[str] = None
17 quality_scores: Optional[Dict[str, int]] = None
18 extracted_data: Optional[List[Dict[str, Any]]] = None
19
20
21@dataclass
22class EvaluationResult:
23 """Result of HTML quality evaluation."""
24 score: int # 1-10 scale
25 reasoning: str
26
27
28@dataclass
29class PreEvaluationResult:
30 """Result of pre-evaluation checks before sending to Claude."""
31 is_valid_html: bool
32 score: Optional[int] = None # If we can determine score without Claude
33 reasoning: Optional[str] = None
34 should_continue_to_claude: bool = True
35
36
37@dataclass
38class GoalExtractionResult:
39 """Result of extracting goal from natural language prompt."""
40 goal: str
41 url: str
42 success: bool
43 error_message: Optional[str] = None
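
A small illustrative sketch (not a repository file) of how these dataclasses are typically populated by the pipeline; all values below are made up.

from llmscraper.models import ScrapingResult, EvaluationResult

evaluation = EvaluationResult(score=8, reasoning="Target data is present and easy to select")
result = ScrapingResult(
    success=True,
    best_actor="cheerio-scraper",
    quality_scores={"cheerio-scraper": evaluation.score, "web-scraper": 5},
    extracted_data=[{"title": "Widget", "price": "$9.99"}],
)
print(result.best_actor, result.quality_scores)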

src/llmscraper/pipeline.py

1"""
2Main pipeline for intelligent web scraping.
3"""
4
5import logging
6from typing import Optional
7
8from .models import ScrapingResult
9from .scraping import MultiActorScraper
10from .scraping.actor_multi_scraper import ActorMultiScraper
11from .evaluation import HTMLQualityEvaluator
12from .generation import ScriptGenerator, ScriptExecutor
13from .utils import prune_html, validate_required_keys, get_api_key
14
15
16class IntelligentScraperPipeline:
17 """Main pipeline class that orchestrates the intelligent web scraping process."""
18
19 def __init__(self, apify_token: str, claude_api_key: str, actor_logger=None):
20 """
21 Initialize the pipeline with required API tokens.
22
23 Args:
24 apify_token: Apify API token for web scraping
25 claude_api_key: Anthropic Claude API key for AI analysis
26 actor_logger: Optional Actor logger for actor mode
27 """
28 # Validate API keys
29 validated_keys = validate_required_keys(
30 apify_token=apify_token,
31 claude_api_key=claude_api_key
32 )
33
34 self.apify_token = validated_keys['apify_token']
35 self.claude_api_key = validated_keys['claude_api_key']
36
37 # Initialize components
38 self.multi_scraper = MultiActorScraper(self.apify_token)
39 self.actor_scraper = ActorMultiScraper() # For actor-to-actor communication
40 self.quality_evaluator = HTMLQualityEvaluator(self.claude_api_key)
41 self.script_generator = ScriptGenerator(self.claude_api_key)
42 self.script_executor = ScriptExecutor()
43
44 # Setup logging - use Actor logger if provided, otherwise standard logging
45 self.logger = actor_logger if actor_logger else logging.getLogger(__name__)
46 self.is_actor_mode = actor_logger is not None
47
48 async def run_complete_pipeline(self, target_url: str, user_goal: str,
49 output_script_path: Optional[str] = None,
50 prune_before_evaluation: bool = True,
51 test_script: bool = False,
52 for_actor: bool = False) -> ScrapingResult:
53 """
54 Run the complete intelligent scraping pipeline.
55
56 Args:
57 target_url: The URL to scrape
58 user_goal: Natural language description of what to extract
59 output_script_path: Path where to save the generated script (None for actor mode)
60 prune_before_evaluation: If True, prune HTML before quality evaluation
61 test_script: If True, test the generated script before finalizing
62 for_actor: If True, generate script for Apify actor format
63
64 Returns:
65 ScrapingResult containing the outcome and generated artifacts
66 """
67 self.logger.info(f"PIPELINE: Starting intelligent scraping pipeline for: {target_url}")
68 self.logger.info(f"PIPELINE: User goal: {user_goal}")
69 self.logger.info(f"PIPELINE: Actor mode: {for_actor}")
70
71 try:
72 # Step 1: Run multiple actors to scrape the website
73 self.logger.info("PIPELINE: Step 1: Running multi-actor scraping...")
74
75 # Use actor-aware scraper if running inside an Apify actor
76 if for_actor:
77 self.logger.info("PIPELINE: Using actor-to-actor communication...")
78 scraping_results = await self.actor_scraper.scrape_with_multiple_actors(target_url)
79 else:
80 self.logger.info("PIPELINE: Using client-based scraping...")
81 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url)
82
83 if not any(content for content in scraping_results.values() if content):
84 return ScrapingResult(
85 success=False,
86 error_message="All scraping actors failed to retrieve content"
87 )
88
89 # Step 2: Evaluate quality of each result
90 self.logger.info("PIPELINE: Step 2: Evaluating HTML quality for each actor...")
91 quality_scores, best_actor, best_html = self._evaluate_html_quality(
92 scraping_results, user_goal, prune_before_evaluation
93 )
94
95 if not best_html:
96 return ScrapingResult(
97 success=False,
98 error_message="No actor produced quality HTML content",
99 quality_scores=quality_scores
100 )
101
102 self.logger.info(f"PIPELINE: Best actor selected: {best_actor} with score {quality_scores[best_actor]}/10")
103
104 # Step 3: Prune the best HTML to reduce token count
105 self.logger.info("PIPELINE: Step 3: Pruning HTML content...")
106 pruned_html = prune_html(best_html, max_list_items=4, max_text_length=500)
107
108 original_length = len(best_html)
109 pruned_length = len(pruned_html)
110 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0
111
112 self.logger.info(f"PIPELINE: HTML pruned: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")
113
114 # Step 4: Generate Python scraping script
115 self.logger.info("PIPELINE: Step 4: Generating Python scraping script...")
116 generated_script = self.script_generator.generate_scraping_script(
117 target_url, best_actor, pruned_html, user_goal, for_actor
118 )
119
120 if not generated_script:
121 return ScrapingResult(
122 success=False,
123 error_message="Failed to generate scraping script",
124 best_actor=best_actor,
125 quality_scores=quality_scores
126 )
127
128 # Step 5: Test the script if requested
129 extracted_data = None
130 if test_script:
131 self.logger.info("PIPELINE: Step 5: Testing generated script...")
132 test_result = self.script_executor.test_script(generated_script, best_html)
133
134 if test_result["success"]:
135 self.logger.info(f"PIPELINE: ✅ Script test passed! Extracted {test_result.get('item_count', 0)} items")
136 extracted_data = test_result["data"]
137 else:
138 self.logger.warning(f"PIPELINE: ⚠️ Script test failed: {test_result['error']}")
139 # Continue anyway, but log the issue
140
141 # Step 6: Save the generated script (only if not actor mode)
142 if output_script_path and not for_actor:
143 self.logger.info(f"PIPELINE: Step 6: Saving generated script to {output_script_path}")
144 with open(output_script_path, 'w', encoding='utf-8') as f:
145 f.write(generated_script)
146
147 self.logger.info("PIPELINE: ✅ Pipeline completed successfully!")
148
149 return ScrapingResult(
150 success=True,
151 generated_script=generated_script,
152 best_actor=best_actor,
153 quality_scores=quality_scores,
154 extracted_data=extracted_data
155 )
156
157 except Exception as e:
158 self.logger.error(f"PIPELINE: Pipeline failed with error: {str(e)}")
159 return ScrapingResult(
160 success=False,
161 error_message=f"Pipeline error: {str(e)}"
162 )
163
164 def _evaluate_html_quality(self, scraping_results: dict, user_goal: str,
165 prune_before_evaluation: bool) -> tuple[dict, Optional[str], Optional[str]]:
166 """Evaluate HTML quality for each scraping result."""
167 quality_scores = {}
168 best_actor = None
169 best_html = None
170 best_score = 0
171
172 for actor_name, html_content in scraping_results.items():
173 if html_content:
174 self.logger.info(f"PIPELINE: Evaluating {actor_name}...")
175
176 # Optionally prune HTML before evaluation
177 evaluation_html = html_content
178 if prune_before_evaluation:
179 original_length = len(html_content)
180 evaluation_html = prune_html(html_content, max_list_items=3, max_text_length=100)
181 pruned_length = len(evaluation_html)
182 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0
183 self.logger.info(f"PIPELINE: {actor_name} HTML pruned for evaluation: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")
184
185 evaluation = self.quality_evaluator.evaluate_html_quality(user_goal, evaluation_html)
186
187 if evaluation:
188 quality_scores[actor_name] = evaluation.score
189 self.logger.info(f"PIPELINE: {actor_name} quality score: {evaluation.score}/10 - {evaluation.reasoning}")
190
191 if evaluation.score > best_score:
192 best_score = evaluation.score
193 best_actor = actor_name
194 best_html = html_content # Keep original HTML, not pruned version
195 else:
196 quality_scores[actor_name] = 0
197 self.logger.warning(f"PIPELINE: Failed to evaluate {actor_name}")
198 else:
199 quality_scores[actor_name] = 0
200 self.logger.warning(f"PIPELINE: {actor_name} returned no content")
201
202 return quality_scores, best_actor, best_html
203
204
205async def run_intelligent_scraper(target_url: str, user_goal: str,
206 apify_token: Optional[str] = None,
207 claude_api_key: Optional[str] = None,
208 output_path: Optional[str] = "generated_scraper.py",
209 prune_before_evaluation: bool = True,
210 test_script: bool = False,
211 for_actor: bool = False,
212 actor_logger=None) -> ScrapingResult:
213 """
214 Convenience function to run the complete intelligent scraping pipeline.
215
216 Args:
217 target_url: URL to scrape
218 user_goal: Natural language description of extraction goal
219 apify_token: Apify API token (uses APIFY_TOKEN env var if not provided)
220 claude_api_key: Claude API key (uses CLAUDE_API_KEY env var if not provided)
221 output_path: Path to save the generated script (None for actor mode)
222 prune_before_evaluation: If True, prune HTML before quality evaluation
223 test_script: If True, test the generated script before finalizing
224 for_actor: If True, generate script for Apify actor format
225 actor_logger: Optional Actor logger for actor mode
226
227 Returns:
228 ScrapingResult with the outcome
229 """
230 # Get tokens from environment if not provided
231 if not apify_token:
232 apify_token = get_api_key("APIFY_TOKEN")
233 if not claude_api_key:
234 claude_api_key = get_api_key("CLAUDE_API_KEY")
235
236 if not apify_token:
237 return ScrapingResult(
238 success=False,
239 error_message="APIFY_TOKEN not provided and not found in environment variables"
240 )
241
242 if not claude_api_key:
243 return ScrapingResult(
244 success=False,
245 error_message="CLAUDE_API_KEY not provided and not found in environment variables"
246 )
247
248 # Create and run pipeline
249 pipeline = IntelligentScraperPipeline(apify_token, claude_api_key, actor_logger)
250 return await pipeline.run_complete_pipeline(
251 target_url, user_goal, output_path, prune_before_evaluation, test_script, for_actor
252 )
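
A minimal usage sketch (not a repository file) of the convenience entry point. It assumes APIFY_TOKEN and CLAUDE_API_KEY are set in the environment and uses https://example.com as a placeholder target.

import asyncio
from llmscraper.pipeline import run_intelligent_scraper

async def demo():
    result = await run_intelligent_scraper(
        target_url="https://example.com",                 # placeholder URL
        user_goal="Extract product names and prices",
        output_path="generated_scraper.py",
        test_script=True,
    )
    if result.success:
        print(f"Best actor: {result.best_actor}, scores: {result.quality_scores}")
    else:
        print(f"Pipeline failed: {result.error_message}")

asyncio.run(demo())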

src/llmscraper/evaluation/__init__.py

1"""
2Evaluation module for ScraperCodeGenerator.
3"""
4
5from .html_quality_evaluator import HTMLQualityEvaluator
6
7__all__ = ["HTMLQualityEvaluator"]

src/llmscraper/evaluation/html_quality_evaluator.py

1"""
2HTML quality evaluation using Claude AI.
3"""
4
5import json
6import logging
7import re
8from typing import Optional
9
10import anthropic
11
12from ..models import EvaluationResult, PreEvaluationResult
13
14
15class HTMLQualityEvaluator:
16 """Evaluates HTML quality for web scraping using Claude AI."""
17
18 def __init__(self, claude_api_key: str):
19 """Initialize with Claude API key."""
20 if not claude_api_key or not claude_api_key.strip():
21 raise ValueError("Claude API key cannot be empty")
22
23 self.client = anthropic.Anthropic(api_key=claude_api_key)
24 self.logger = logging.getLogger(__name__)
25
26 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
27 """
28 Evaluate HTML quality for data extraction.
29
30 Args:
31 user_goal: User's extraction goal
32 html_content: HTML content to evaluate
33
34 Returns:
35 EvaluationResult or None if evaluation fails
36 """
37 try:
38 # Pre-evaluation checks
39 pre_eval = self._pre_evaluate_html(html_content)
40 if not pre_eval.should_continue_to_claude:
41 if pre_eval.score is not None:
42 return EvaluationResult(score=pre_eval.score, reasoning=pre_eval.reasoning)
43 return None
44
45 # Claude evaluation
46 return self._evaluate_with_claude(user_goal, html_content)
47
48 except Exception as e:
49 self.logger.error(f"Error evaluating HTML quality: {str(e)}")
50 return None
51
52 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:
53 """Perform basic HTML validation checks."""
54 if not html_content or not html_content.strip():
55 return PreEvaluationResult(
56 is_valid_html=False,
57 score=1,
58 reasoning="Empty or whitespace-only HTML content",
59 should_continue_to_claude=False
60 )
61
62 # Check for common failure indicators
63 content_lower = html_content.lower()
64
65 # Bot detection/blocking indicators
66 blocking_indicators = [
67 'please verify you are a human',
68 'access denied',
69 'blocked',
70 'captcha',
71 'cloudflare',
72 'ddos protection',
73 'security check',
74 'bot detected'
75 ]
76
77 for indicator in blocking_indicators:
78 if indicator in content_lower:
79 return PreEvaluationResult(
80 is_valid_html=False,
81 score=1,
82 reasoning=f"HTML appears to be blocked/bot-detected (found: '{indicator}')",
83 should_continue_to_claude=False
84 )
85
86 # Check for minimal HTML structure
87 if not re.search(r'<html|<body|<div|<p|<span', content_lower):
88 return PreEvaluationResult(
89 is_valid_html=False,
90 score=2,
91 reasoning="HTML lacks basic structural elements",
92 should_continue_to_claude=False
93 )
94
95 return PreEvaluationResult(
96 is_valid_html=True,
97 should_continue_to_claude=True
98 )
99
100 def _evaluate_with_claude(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
101 """Evaluate HTML using Claude AI."""
102 try:
103 prompt = self._create_evaluation_prompt(user_goal, html_content)
104
105 response = self.client.messages.create(
106 model="claude-3-5-sonnet-20241022",
107 max_tokens=500,
108 messages=[{"role": "user", "content": prompt}]
109 )
110
111 content = response.content[0].text
112 return self._parse_evaluation_response(content)
113
114 except Exception as e:
115 self.logger.error(f"Error in Claude evaluation: {str(e)}")
116 return None
117
118 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:
119 """Create the evaluation prompt for Claude."""
120 return f"""You are an expert web scraper evaluator. Analyze the provided HTML and determine how suitable it is for extracting the requested data.
121
122USER EXTRACTION GOAL:
123{user_goal}
124
125HTML CONTENT TO EVALUATE:
126{html_content}
127
128Evaluate the HTML on a scale of 1-10 based on:
1291. Presence of the target data elements
1302. HTML structure quality and accessibility
1313. Whether the page loaded correctly (not blocked, error page, etc.)
1324. How easy it would be to extract the requested data
133
134Return your evaluation in this EXACT JSON format:
135{{
136 "score": [1-10 integer],
137 "reasoning": "[brief explanation of the score]"
138}}
139
140Only return the JSON, no other text.
141"""
142
143 def _parse_evaluation_response(self, response: str) -> Optional[EvaluationResult]:
144 """Parse Claude's evaluation response."""
145 try:
146 # Extract JSON from response
147 json_match = re.search(r'\{.*\}', response, re.DOTALL)
148 if not json_match:
149 raise ValueError("No JSON found in response")
150
151 data = json.loads(json_match.group())
152
153 score = data.get('score')
154 reasoning = data.get('reasoning', '')
155
156 if not isinstance(score, int) or score < 1 or score > 10:
157 raise ValueError(f"Invalid score: {score}")
158
159 return EvaluationResult(score=score, reasoning=reasoning)
160
161 except Exception as e:
162 self.logger.error(f"Error parsing evaluation response: {str(e)}")
163 return None
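
An illustrative standalone use of the evaluator (not a repository file); it assumes a valid key in CLAUDE_API_KEY and uses a tiny hand-written HTML snippet.

import os
from llmscraper.evaluation import HTMLQualityEvaluator

evaluator = HTMLQualityEvaluator(os.environ["CLAUDE_API_KEY"])
sample_html = "<html><body><div class='product'><h2>Widget</h2><span>$9.99</span></div></body></html>"
evaluation = evaluator.evaluate_html_quality("Extract product names and prices", sample_html)
if evaluation:
    print(f"Score: {evaluation.score}/10 - {evaluation.reasoning}")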

src/llmscraper/generation/__init__.py

1"""
2Generation module for ScraperCodeGenerator.
3"""
4
5from .script_generator import ScriptGenerator
6from .script_executor import ScriptExecutor
7
8__all__ = ["ScriptGenerator", "ScriptExecutor"]

src/llmscraper/generation/script_executor.py

1"""
2Script execution and testing functionality.
3"""
4
5import subprocess
6import tempfile
7import os
8import json
9import logging
10from typing import Dict, Any, Optional
11import ast
12import traceback
13
14
15class ScriptExecutor:
16 """Executes and tests generated scraping scripts."""
17
18 def __init__(self):
19 """Initialize the script executor."""
20 self.logger = logging.getLogger(__name__)
21
22 def test_script(self, script_content: str, html_content: str) -> Dict[str, Any]:
23 """
24 Test a scraping script against sample HTML content.
25
26 Args:
27 script_content: The Python script to test
28 html_content: Sample HTML to test against
29
30 Returns:
31 Dict with test results including success, data, and errors
32 """
33 try:
34 # Extract the extract_data function from the script
35 extract_function = self._extract_function_from_script(script_content, 'extract_data')
36
37 if not extract_function:
38 return {
39 "success": False,
40 "error": "Could not find extract_data function in script",
41 "data": None
42 }
43
44 # Create a safe execution environment
45 safe_globals = {
46 '__builtins__': {
47 'len': len,
48 'str': str,
49 'int': int,
50 'float': float,
51 'bool': bool,
52 'list': list,
53 'dict': dict,
54 'range': range,
55 'enumerate': enumerate,
56 'zip': zip,
57 'isinstance': isinstance,
58 'hasattr': hasattr,
59 'getattr': getattr,
60 'print': print, '__import__': __import__,  # __import__ is required so the exec'd import statements below can resolve modules
61 }
62 }
63
64 # Import necessary modules into the environment
65 exec("from bs4 import BeautifulSoup", safe_globals)
66 exec("import re", safe_globals)
67 exec("import json", safe_globals)
68
69 # Execute the function definition
70 exec(extract_function, safe_globals)
71
72 # Call the function with the HTML content
73 extracted_data = safe_globals['extract_data'](html_content)
74
75 return {
76 "success": True,
77 "data": extracted_data,
78 "error": None,
79 "data_type": type(extracted_data).__name__,
80 "item_count": len(extracted_data) if isinstance(extracted_data, (list, dict)) else 1
81 }
82
83 except Exception as e:
84 self.logger.error(f"Error testing script: {str(e)}")
85 return {
86 "success": False,
87 "error": str(e),
88 "data": None,
89 "traceback": traceback.format_exc()
90 }
91
92 def _extract_function_from_script(self, script_content: str, function_name: str) -> Optional[str]:
93 """Extract a specific function from a script."""
94 try:
95 # Parse the script into an AST
96 tree = ast.parse(script_content)
97
98 # Find the function definition
99 for node in ast.walk(tree):
100 if isinstance(node, ast.FunctionDef) and node.name == function_name:
101 # Get the source code of the function
102 lines = script_content.split('\n')
103 start_line = node.lineno - 1
104
105 # Find the end of the function
106 end_line = start_line + 1
107 while end_line < len(lines):
108 line = lines[end_line]
109 # Check if this line starts a new function or class
110 if line.strip() and not line.startswith(' ') and not line.startswith('\t'):
111 break
112 end_line += 1
113
114 return '\n'.join(lines[start_line:end_line])
115
116 return None
117
118 except Exception as e:
119 self.logger.error(f"Error extracting function: {str(e)}")
120 return None
121
122 def validate_script_syntax(self, script_content: str) -> Dict[str, Any]:
123 """
124 Validate the syntax of a Python script.
125
126 Args:
127 script_content: The Python script to validate
128
129 Returns:
130 Dict with validation results
131 """
132 try:
133 # Try to parse the script
134 ast.parse(script_content)
135
136 return {
137 "valid": True,
138 "error": None
139 }
140
141 except SyntaxError as e:
142 return {
143 "valid": False,
144 "error": f"Syntax error: {str(e)}",
145 "line": e.lineno,
146 "offset": e.offset
147 }
148 except Exception as e:
149 return {
150 "valid": False,
151 "error": f"Parse error: {str(e)}"
152 }
153
154 def run_script_in_sandbox(self, script_content: str, timeout: int = 60) -> Dict[str, Any]:
155 """
156 Run a complete script in a sandboxed environment.
157
158 Args:
159 script_content: The complete Python script
160 timeout: Maximum execution time in seconds
161
162 Returns:
163 Dict with execution results
164 """
165 try:
166 # Create a temporary file
167 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:
168 temp_file.write(script_content)
169 temp_file_path = temp_file.name
170
171 try:
172 # Run the script
173 result = subprocess.run(
174 ['python', temp_file_path],
175 capture_output=True,
176 text=True,
177 timeout=timeout,
178 cwd=os.path.dirname(temp_file_path)
179 )
180
181 return {
182 "success": result.returncode == 0,
183 "stdout": result.stdout,
184 "stderr": result.stderr,
185 "return_code": result.returncode
186 }
187
188 finally:
189 # Clean up the temporary file
190 os.unlink(temp_file_path)
191
192 except subprocess.TimeoutExpired:
193 return {
194 "success": False,
195 "stdout": "",
196 "stderr": f"Script execution timed out after {timeout} seconds",
197 "return_code": -1
198 }
199 except Exception as e:
200 return {
201 "success": False,
202 "stdout": "",
203 "stderr": str(e),
204 "return_code": -1
205 }
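
A quick sketch (not a repository file) of testing a hand-written extract_data function against inline HTML; it assumes beautifulsoup4 is installed.

from llmscraper.generation import ScriptExecutor

executor = ScriptExecutor()
script = '''
def extract_data(html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    return [{"title": h2.get_text(strip=True)} for h2 in soup.find_all("h2")]
'''
html = "<html><body><h2>First</h2><h2>Second</h2></body></html>"
result = executor.test_script(script, html)
print(result["success"], result.get("item_count"), result["data"])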

src/llmscraper/generation/script_generator.py

1"""
2Code generation functionality for creating scraping scripts.
3"""
4
5import logging
6from typing import Optional
7import re
8
9import anthropic
10
11
12class ScriptGenerator:
13 """Generates Python scraping scripts using Claude AI."""
14
15 def __init__(self, claude_api_key: str):
16 """Initialize with Claude API key."""
17 if not claude_api_key or not claude_api_key.strip():
18 raise ValueError("Claude API key cannot be empty")
19
20 self.client = anthropic.Anthropic(api_key=claude_api_key)
21 self.logger = logging.getLogger(__name__)
22
23 def generate_scraping_script(self, target_url: str, best_actor: str,
24 pruned_html: str, user_goal: str,
25 for_actor: bool = False) -> Optional[str]:
26 """
27 Generate a complete Python scraping script.
28
29 Args:
30 target_url: The target URL to scrape
31 best_actor: Name of the best performing actor
32 pruned_html: Sample HTML content for reference
33 user_goal: User's extraction goal
34 for_actor: If True, generate for Apify actor (key-value store output)
35
36 Returns:
37 Complete Python script as string, or None if generation fails
38 """
39 try:
40 # Generate the HTML parsing code from Claude
41 parsing_code = self._generate_html_parsing_code(pruned_html, user_goal)
42
43 if not parsing_code:
44 self.logger.error("Failed to generate HTML parsing code")
45 return None
46
47 # Create the complete script
48 if for_actor:
49 return self._create_actor_script(target_url, best_actor, parsing_code, user_goal)
50 else:
51 return self._create_standalone_script(target_url, best_actor, parsing_code, user_goal)
52
53 except Exception as e:
54 self.logger.error(f"Error generating scraping script: {str(e)}")
55 return None
56
57 def _generate_html_parsing_code(self, pruned_html: str, user_goal: str) -> Optional[str]:
58 """Generate HTML parsing/extraction code using Claude."""
59 try:
60 prompt = f"""Generate Python code that parses HTML content and extracts data based on the user's goal. You should generate ONLY the extraction logic, not a complete script.
61
62## USER GOAL:
63{user_goal}
64
65## SAMPLE HTML (for reference):
66{pruned_html}
67
68## REQUIREMENTS:
691. Create a function called `extract_data(html_content)` that takes HTML string as input
702. Use BeautifulSoup to parse the HTML
713. Extract the data according to the user's goal using CSS selectors, attributes, text content, etc.
724. Return the extracted data as a Python dictionary or list of dictionaries
735. Handle missing or malformed data gracefully
746. Include appropriate error handling
75
76## EXAMPLE OUTPUT FORMAT:
77```python
78def extract_data(html_content):
79 from bs4 import BeautifulSoup
80
81 soup = BeautifulSoup(html_content, 'html.parser')
82 results = []
83
84 # Your extraction logic here
85 # Use soup.find(), soup.find_all(), CSS selectors, etc.
86
87 return results
88```
89
90Generate ONLY the `extract_data` function and any helper functions needed. Do not include imports outside the function, full scripts, or other boilerplate code."""
91
92 self.logger.info("Requesting HTML parsing code generation from Claude...")
93
94 response = self.client.messages.create(
95 model="claude-3-5-sonnet-20241022",
96 max_tokens=2000,
97 messages=[{"role": "user", "content": prompt}]
98 )
99
100 parsing_code = response.content[0].text
101
102 # Extract Python code from response if wrapped in code blocks
103 if "```python" in parsing_code:
104 code_match = re.search(r'```python\n(.*?)\n```', parsing_code, re.DOTALL)
105 if code_match:
106 parsing_code = code_match.group(1)
107
108 return parsing_code
109
110 except Exception as e:
111 self.logger.error(f"Error generating HTML parsing code: {str(e)}")
112 return None
113
114 def _create_standalone_script(self, target_url: str, best_actor: str,
115 parsing_code: str, user_goal: str) -> str:
116 """Create a standalone Python script."""
117 return f'''#!/usr/bin/env python3
118"""
119Generated Web Scraper
120Target: {target_url}
121Goal: {user_goal}
122Best Actor: {best_actor}
123Generated by: ScraperCodeGenerator
124"""
125
126import os
127import json
128import logging
129from typing import Dict, Any, List, Optional
130from bs4 import BeautifulSoup
131
132from llmscraper.scraping import MultiActorScraper
133
134
135{parsing_code}
136
137
138def main():
139 """Main function to run the scraper."""
140 # Setup logging
141 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
142 logger = logging.getLogger(__name__)
143
144 # Configuration
145 target_url = "{target_url}"
146 apify_token = os.getenv("APIFY_TOKEN")
147
148 if not apify_token:
149 logger.error("APIFY_TOKEN environment variable not set")
150 logger.info("Please set your Apify API token: export APIFY_TOKEN='your_token_here'")
151 return
152
153 try:
154 logger.info(f"🚀 Starting scraper for: {{target_url}}")
155 logger.info(f"📝 Goal: {user_goal}")
156 logger.info(f"🏆 Using best actor: {best_actor}")
157
158 # Initialize the multi-actor scraper
159 scraper = MultiActorScraper(apify_token)
160
161 # Get the actor configuration for the best performing actor
162 actor_configs = scraper._get_actor_configs(target_url)
163
164 if "{best_actor}" not in actor_configs:
165 logger.error(f"Best actor '{best_actor}' not found in available actors")
166 return
167
168 # Run the best actor to get HTML
169 logger.info(f"🔄 Running {{'{best_actor}'}} actor...")
170 actor_name, html_content = scraper._run_single_actor("{best_actor}", actor_configs["{best_actor}"])
171
172 if not html_content:
173 logger.error(f"Failed to get HTML content from {{'{best_actor}'}} actor")
174 return
175
176 logger.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")
177
178 # Extract data using the generated parsing code
179 logger.info("🔍 Extracting data from HTML...")
180 extracted_data = extract_data(html_content)
181
182 if not extracted_data:
183 logger.warning("No data was extracted from the HTML")
184 return
185
186 # Prepare final results
187 results = {{
188 "target_url": target_url,
189 "extraction_goal": "{user_goal}",
190 "actor_used": "{best_actor}",
191 "data": extracted_data,
192 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1
193 }}
194
195 # Output results
196 print("\\n" + "="*60)
197 print("📊 EXTRACTION RESULTS")
198 print("="*60)
199 print(json.dumps(results, indent=2, ensure_ascii=False))
200
201 # Save to file
202 output_file = "extracted_data.json"
203 with open(output_file, 'w', encoding='utf-8') as f:
204 json.dump(results, f, indent=2, ensure_ascii=False)
205
206 logger.info(f"💾 Results saved to {{output_file}}")
207 logger.info(f"🎉 Successfully extracted {{results['total_items']}} items!")
208
209 except Exception as e:
210 logger.error(f"❌ Scraping failed: {{e}}")
211 import traceback
212 traceback.print_exc()
213
214
215if __name__ == "__main__":
216 main()
217'''
218
219 def _create_actor_script(self, target_url: str, best_actor: str,
220 parsing_code: str, user_goal: str) -> str:
221 """Create a script for Apify actor."""
222 return f'''"""
223Apify Actor Script
224Target: {target_url}
225Goal: {user_goal}
226Best Actor: {best_actor}
227Generated by: ScraperCodeGenerator
228"""
229
230import json
231from apify import Actor
232from bs4 import BeautifulSoup
233
234from llmscraper.scraping import MultiActorScraper
235
236
237{parsing_code}
238
239
240async def main():
241 """Main actor function."""
242 async with Actor:
243 # Get input
244 actor_input = await Actor.get_input() or {{}}
245 target_url = actor_input.get('targetUrl', '{target_url}')
246 user_goal = actor_input.get('userGoal', '{user_goal}')
247 apify_token = actor_input.get('apifyToken') or Actor.config.token
248
249 Actor.log.info(f"🚀 Starting scraper for: {{target_url}}")
250 Actor.log.info(f"📝 Goal: {{user_goal}}")
251 Actor.log.info(f"🏆 Using best actor: {best_actor}")
252
253 try:
254 # Initialize the multi-actor scraper
255 scraper = MultiActorScraper(apify_token)
256
257 # Run the best actor to get HTML
258 Actor.log.info(f"🔄 Running {best_actor} actor...")
259 actor_name, html_content = scraper._run_single_actor(
260 "{best_actor}",
261 scraper._get_actor_configs(target_url)["{best_actor}"]
262 )
263
264 if not html_content:
265 await Actor.fail(status_message=f"Failed to get HTML content from {best_actor} actor")
266 return
267
268 Actor.log.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")
269
270 # Extract data using the generated parsing code
271 Actor.log.info("🔍 Extracting data from HTML...")
272 extracted_data = extract_data(html_content)
273
274 if not extracted_data:
275 Actor.log.warning("No data was extracted from the HTML")
276 extracted_data = []
277
278 # Prepare final results
279 results = {{
280 "target_url": target_url,
281 "extraction_goal": user_goal,
282 "actor_used": "{best_actor}",
283 "data": extracted_data,
284 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1
285 }}
286
287 # Save to key-value store
288 await Actor.set_value('OUTPUT', results)
289
290 Actor.log.info(f"🎉 Successfully extracted {{results['total_items']}} items!")
291 Actor.log.info("💾 Results saved to key-value store as 'OUTPUT'")
292
293 except Exception as e:
294 Actor.log.error(f"❌ Scraping failed: {{e}}")
295 await Actor.fail(status_message=str(e))
296
297
298if __name__ == "__main__":
299 import asyncio
300 asyncio.run(main())
301'''
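
A hedged usage sketch (not a repository file): generating a script makes a real Claude API call, so it assumes CLAUDE_API_KEY is set; the URL and HTML below are placeholders.

import os
from llmscraper.generation import ScriptGenerator

generator = ScriptGenerator(os.environ["CLAUDE_API_KEY"])
script = generator.generate_scraping_script(
    target_url="https://example.com",                     # placeholder URL
    best_actor="cheerio-scraper",
    pruned_html="<html><body><h2 class='title'>Widget</h2></body></html>",
    user_goal="Extract product titles",
    for_actor=False,
)
if script:
    with open("generated_scraper.py", "w", encoding="utf-8") as f:
        f.write(script)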

src/llmscraper/scraping/__init__.py

1"""
2Scraping module for ScraperCodeGenerator.
3"""
4
5from .apify_runner import ApifyRunner
6from .multi_actor_scraper import MultiActorScraper
7from .actor_multi_scraper import ActorMultiScraper
8
9__all__ = ["ApifyRunner", "MultiActorScraper", "ActorMultiScraper"]

src/llmscraper/scraping/actor_multi_scraper.py

1"""
2Apify Actor-specific scraping module for running other actors from within an Apify actor.
3"""
4
5import logging
6from typing import Dict, Any, List, Optional
7from apify import Actor
8
9
10class ActorMultiScraper:
11 """Handles running multiple Apify actors from within an Apify actor context."""
12
13 def __init__(self):
14 """Initialize the actor scraper."""
15 self.logger = logging.getLogger(__name__)
16
17 async def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, Optional[str]]:
18 """
19 Run multiple actors to scrape the target URL and return HTML content.
20
21 Args:
22 target_url: The URL to scrape
23
24 Returns:
25 Dictionary mapping actor names to their HTML content (or None if failed)
26 """
27 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors called for {target_url}")
28 results = {}
29
30 # Define actor configurations
31 actor_configs = self._get_actor_configs(target_url)
32 Actor.log.info(f"DEBUG: Will run {len(actor_configs)} actors: {list(actor_configs.keys())}")
33
34 # Run each actor and collect results
35 for actor_name, config in actor_configs.items():
36 try:
37 Actor.log.info(f"DEBUG: Starting {actor_name}...")
38 html_content = await self._run_single_actor(actor_name, config)
39 results[actor_name] = html_content
40
41 if html_content:
42 Actor.log.info(f"DEBUG: {actor_name} succeeded: {len(html_content):,} characters")
43 else:
44 Actor.log.warning(f"DEBUG: {actor_name} returned no content")
45
46 except Exception as e:
47 Actor.log.error(f"DEBUG: {actor_name} failed: {str(e)}")
48 results[actor_name] = None
49
50 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors completed. Results: {list(results.keys())}")
51 Actor.log.info(f"DEBUG: Results with content: {[name for name, content in results.items() if content]}")
52 return results
53
54 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:
55 """Get configurations for all actors to run."""
56 return {
57 "cheerio-scraper": {
58 "actor_id": "apify/cheerio-scraper",
59 "input": {
60 "startUrls": [{"url": target_url}],
61 "maxRequestRetries": 3,
62 "requestTimeoutSecs": 30,
63 "maxPagesPerCrawl": 1,
64 "pageFunction": """
65 async function pageFunction(context) {
66 const { request, log, $ } = context;
67 try {
68 const title = $('title').text() || '';
69 const html = $('html').html() || '';
70 return {
71 url: request.url,
72 title: title,
73 html: html
74 };
75 } catch (error) {
76 log.error('Error in pageFunction:', error);
77 return {
78 url: request.url,
79 title: '',
80 html: ''
81 };
82 }
83 }
84 """,
85 "proxyConfiguration": {"useApifyProxy": True}
86 }
87 },
88 "web-scraper": {
89 "actor_id": "apify/web-scraper",
90 "input": {
91 "startUrls": [{"url": target_url}],
92 "maxRequestRetries": 3,
93 "requestTimeoutSecs": 30,
94 "maxPagesPerCrawl": 1,
95 "pageFunction": """
96 async function pageFunction(context) {
97 const { request, log, page } = context;
98 try {
99 const title = await page.title();
100 const html = await page.content();
101 return { url: request.url, title, html };
102 } catch (error) {
103 log.error('Error in pageFunction:', error);
104 return { url: request.url, title: '', html: '' };
105 }
106 }
107 """,
108 "proxyConfiguration": {"useApifyProxy": True}
109 }
110 },
111 "website-content-crawler": {
112 "actor_id": "apify/website-content-crawler",
113 "input": {
114 "startUrls": [{"url": target_url}],
115 "maxRequestsPerCrawl": 1,
116 "maxCrawlDepth": 0,
117 "htmlTransformer": "readableText",
118 "readableTextCharThreshold": 100,
119 "removeCookieWarnings": True,
120 "clickElementsCssSelector": "",
121 "proxyConfiguration": {"useApifyProxy": True}
122 }
123 }
124 }
125
126 async def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:
127 """
128 Run a single actor and extract HTML content.
129
130 Args:
131 actor_name: Name of the actor (for logging)
132 config: Actor configuration including actor_id and input
133
134 Returns:
135 HTML content as string, or None if failed
136 """
137 try:
138 actor_id = config["actor_id"]
139 actor_input = config["input"]
140
141 Actor.log.info(f"DEBUG: Calling actor {actor_id}")
142
143 # Call the actor using Apify SDK - use the exact same pattern as working code
144 run = await Actor.call(
145 actor_id=actor_id,
146 run_input=actor_input
147 )
148
149 if not run:
150 Actor.log.error(f"DEBUG: Actor {actor_name} failed to start - run is None")
151 return None
152
153 Actor.log.info(f"DEBUG: Actor {actor_name} run created with ID: {run.id}")
154 Actor.log.info(f"DEBUG: Default dataset ID: {run.default_dataset_id}")
155
156 # Use the exact same pattern as your working code
157 if run.default_dataset_id:
158 try:
159 Actor.log.info(f"DEBUG: Getting dataset items for {actor_name}...")
160 items = (await Actor.apify_client.dataset(run.default_dataset_id).list_items()).items
161
162 if items:
163 Actor.log.info(f"DEBUG: Found {len(items)} items in dataset for {actor_name}")
164
165 for i, item in enumerate(items):
166 Actor.log.info(f"DEBUG: Dataset item {i} keys: {list(item.keys()) if isinstance(item, dict) else type(item)}")
167
168 # Look for HTML content in the item
169 html_content = self._extract_html_from_item(item, actor_name)
170 if html_content:
171 Actor.log.info(f"DEBUG: Found HTML content in dataset item {i} for {actor_name}: {len(html_content)} characters")
172 return html_content
173 else:
174 Actor.log.info(f"DEBUG: No dataset items found for {actor_name}")
175
176 except Exception as e:
177 Actor.log.error(f"DEBUG: Dataset retrieval failed for {actor_name}: {e}")
178 import traceback
179 Actor.log.error(f"DEBUG: Dataset traceback: {traceback.format_exc()}")
180
181 # Fallback: Try key-value store (simplified)
182 if run.default_key_value_store_id:
183 try:
184 Actor.log.info(f"DEBUG: Trying key-value store as fallback for {actor_name}...")
185 kvs_client = Actor.apify_client.key_value_store(run.default_key_value_store_id)
186
187 # Try common keys that might contain HTML
188 common_keys = ['OUTPUT', 'RESULTS', 'DATA']
189 for key_name in common_keys:
190 try:
191 record = await kvs_client.get_record(key_name)
192 if record:
193 Actor.log.info(f"DEBUG: Found record for key {key_name}")
194 html_content = self._extract_html_from_record(record, actor_name)
195 if html_content:
196 Actor.log.info(f"DEBUG: Found HTML content in key {key_name} for {actor_name}")
197 return html_content
198 except Exception:
199 pass # Key doesn't exist, continue
200
201 except Exception as e:
202 Actor.log.error(f"DEBUG: Key-value store retrieval failed for {actor_name}: {e}")
203
204 Actor.log.warning(f"DEBUG: No HTML content found for {actor_name}")
205 return None
206
207 except Exception as e:
208 Actor.log.error(f"DEBUG: Error running {actor_name}: {str(e)}")
209 import traceback
210 Actor.log.error(f"DEBUG: Full traceback: {traceback.format_exc()}")
211 return None
212
213 def _extract_html_from_item(self, item: Dict[str, Any], actor_name: str) -> Optional[str]:
214 """Extract HTML content from a dataset item."""
215 Actor.log.info(f"DEBUG: Extracting HTML from item for {actor_name}, item keys: {list(item.keys()) if isinstance(item, dict) else 'not a dict'}")
216
217 # Look for HTML in common fields
218 html_fields = ['html', 'content', 'body', 'pageContent', 'text', 'data']
219
220 for field in html_fields:
221 if field in item and item[field]:
222 content = item[field]
223 Actor.log.info(f"DEBUG: Found content in field '{field}': {type(content)}, length: {len(content) if isinstance(content, str) else 'N/A'}")
224
225 if isinstance(content, str) and len(content) > 100:
226 # Check if it looks like HTML
227 if '<' in content and '>' in content:
228 Actor.log.info(f"DEBUG: Found HTML content in field '{field}' for {actor_name}")
229 return content
230 elif actor_name == "website-content-crawler":
231 # For website-content-crawler, text content is also acceptable
232 Actor.log.info(f"DEBUG: Found text content in field '{field}' for {actor_name}")
233 html_content = f"<html><body><div>{content}</div></body></html>"
234 return html_content
235
236 # For website-content-crawler, look for any text-like content
237 if actor_name == "website-content-crawler":
238 for key, value in item.items():
239 if isinstance(value, str) and len(value) > 50:
240 Actor.log.info(f"DEBUG: Using text content from field '{key}' for website-content-crawler")
241 html_content = f"<html><body><div>{value}</div></body></html>"
242 return html_content
243
244 Actor.log.info(f"DEBUG: No HTML content found in item for {actor_name}")
245 return None
246
247 def _extract_html_from_record(self, record: Any, actor_name: str) -> Optional[str]:
248 """Extract HTML content from a key-value store record."""
249 try:
250 # The record might be the content directly or wrapped in a dict
251 content = record
252
253 if hasattr(record, 'value'):
254 content = record.value
255 elif isinstance(record, dict) and 'value' in record:
256 content = record['value']
257
258 # If content is a string, check if it's HTML
259 if isinstance(content, str):
260 if len(content) > 100 and ('<' in content or actor_name == "website-content-crawler"):
261 return content
262
263 # If content is a dict, look for HTML fields
264 elif isinstance(content, dict):
265 html_content = self._extract_html_from_item(content, actor_name)
266 if html_content:
267 return html_content
268
269 return None
270
271 except Exception as e:
272 Actor.log.debug(f"DEBUG: Error extracting HTML from record for {actor_name}: {e}")
273 return None
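
A sketch (not a repository file) of how this class is meant to be used; it only works inside a running Apify actor, since Actor.call needs the actor context. The URL is a placeholder.

from apify import Actor
from llmscraper.scraping import ActorMultiScraper

async def main():
    async with Actor:
        scraper = ActorMultiScraper()
        results = await scraper.scrape_with_multiple_actors("https://example.com")
        for name, html in results.items():
            Actor.log.info(f"{name}: {len(html):,} chars" if html else f"{name}: no content")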

src/llmscraper/scraping/apify_runner.py

1"""
2Apify integration for running actors and retrieving results.
3"""
4
5import logging
6from typing import Optional, Dict, Any, List, Union
7
8from apify_client import ApifyClient
9
10
11class ApifyRunner:
12 """Handles running Apify actors and retrieving results."""
13
14 def __init__(self, api_token: str):
15 """Initialize with API token."""
16 if not api_token or not api_token.strip():
17 raise ValueError("API token cannot be empty")
18
19 self.client = ApifyClient(api_token)
20 self.logger = logging.getLogger(__name__)
21
22 def run_actor(self, actor_id: str, actor_input: dict,
23 retrieve_from: str = "auto") -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
24 """
25 Run an Apify actor and retrieve results.
26
27 Args:
28 actor_id: The ID of the Apify actor
29 actor_input: Input configuration for the actor
30 retrieve_from: "auto", "dataset", "key-value-store", or "both"
31
32 Returns:
33 Retrieved data or None if failed
34 """
35 if not actor_id or not actor_id.strip():
36 raise ValueError("actor_id cannot be empty")
37
38 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:
39 raise ValueError("Invalid retrieve_from option")
40
41 # Determine storage type
42 if retrieve_from == "auto":
43 retrieve_from = "key-value-store" if "website-content-crawler" in actor_id else "dataset"
44
45 try:
46 self.logger.info(f"Starting Apify actor: {actor_id}")
47
48 # Start the actor run
49 run = self.client.actor(actor_id).call(run_input=actor_input)
50
51 if not run or run.get('status') != 'SUCCEEDED':
52 self.logger.error(f"Actor run failed: {run.get('status') if run else 'No run created'}")
53 return None
54
55 run_id = run.get('id')
56 self.logger.info(f"Actor run {run_id} completed successfully")
57
58 # Retrieve results based on type
59 if retrieve_from == "dataset":
60 return self._get_dataset_items(run_id)
61 elif retrieve_from == "key-value-store":
62 return self._get_key_value_store_items(run_id)
63 elif retrieve_from == "both":
64 return {
65 "dataset": self._get_dataset_items(run_id),
66 "key_value_store": self._get_key_value_store_items(run_id)
67 }
68
69 except Exception as e:
70 self.logger.error(f"Error running actor {actor_id}: {str(e)}")
71 return None
72
73 def _get_dataset_items(self, run_id: str) -> List[Dict[str, Any]]:
74 """Get items from the dataset of a run."""
75 try:
76 dataset_id = self.client.run(run_id).get().get('defaultDatasetId')
77 if not dataset_id:
78 self.logger.warning(f"No dataset found for run {run_id}")
79 return []
80
81 dataset_items = list(self.client.dataset(dataset_id).iterate_items())
82 self.logger.info(f"Retrieved {len(dataset_items)} items from dataset")
83 return dataset_items
84
85 except Exception as e:
86 self.logger.error(f"Error retrieving dataset items: {str(e)}")
87 return []
88
89 def _get_key_value_store_items(self, run_id: str) -> Dict[str, Any]:
90 """Get items from the key-value store of a run."""
91 try:
92 kvs_id = self.client.run(run_id).get().get('defaultKeyValueStoreId')
93 if not kvs_id:
94 self.logger.warning(f"No key-value store found for run {run_id}")
95 return {}
96
97 kvs = self.client.key_value_store(kvs_id)
98 keys = kvs.list_keys().get('items', [])  # list_keys() returns a dict; the key records live under 'items'
99
100 items = {}
101 for key_info in keys:
102 # Handle case where key_info might be a string or dict
103 if isinstance(key_info, dict):
104 key_name = key_info.get('key')
105 else:
106 key_name = str(key_info)
107
108 if key_name:
109 try:
110 value = kvs.get_record(key_name)
111 if value:
112 # Handle case where value might be a string or dict
113 if isinstance(value, dict):
114 items[key_name] = value.get('value', value)
115 else:
116 items[key_name] = value
117 except Exception as e:
118 self.logger.warning(f"Failed to retrieve key '{key_name}': {str(e)}")
119
120 self.logger.info(f"Retrieved {len(items)} items from key-value store")
121 return items
122
123 except Exception as e:
124 self.logger.error(f"Error retrieving key-value store items: {str(e)}")
125 return {}
126
127
128# Legacy functions for backward compatibility
129def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:
130 """Legacy function - use ApifyRunner class instead."""
131 runner = ApifyRunner(api_token)
132 result = runner.run_actor(actor_id, actor_input, "dataset")
133 return result if isinstance(result, list) else None
134
135
136def run_apify_actor_with_flexible_retrieval(
137 actor_id: str, actor_input: dict, *, api_token: str, retrieve_from: str = "auto"
138) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
139 """Legacy function - use ApifyRunner class instead."""
140 runner = ApifyRunner(api_token)
141 return runner.run_actor(actor_id, actor_input, retrieve_from)
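
A minimal sketch (not a repository file) of the low-level runner; it assumes APIFY_TOKEN is set and uses a placeholder URL. The pageFunction mirrors the one used by MultiActorScraper below.

import os
from llmscraper.scraping import ApifyRunner

runner = ApifyRunner(os.environ["APIFY_TOKEN"])
items = runner.run_actor(
    "apify/cheerio-scraper",
    {
        "startUrls": [{"url": "https://example.com"}],
        "maxRequestsPerCrawl": 1,
        "pageFunction": "async function pageFunction({ request, $ }) { return { url: request.url, html: $('html').html() }; }",
    },
    retrieve_from="dataset",
)
print(f"Retrieved {len(items) if items else 0} dataset items")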

src/llmscraper/scraping/multi_actor_scraper.py

1"""
2Multi-actor scraping functionality.
3"""
4
5import logging
6from typing import Dict, Any, Optional
7from concurrent.futures import ThreadPoolExecutor, as_completed
8
9from .apify_runner import ApifyRunner
10
11
12class MultiActorScraper:
13 """Scrapes websites using multiple Apify actors simultaneously."""
14
15 def __init__(self, api_token: str):
16 """Initialize with Apify API token."""
17 self.api_token = api_token
18 self.runner = ApifyRunner(api_token)
19 self.logger = logging.getLogger(__name__)
20
21 def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, Optional[str]]:
22 """
23 Scrape a URL with multiple actors and return HTML content.
24
25 Args:
26 target_url: URL to scrape
27
28 Returns:
29 Dict mapping actor names to HTML content
30 """
31 actor_configs = self._get_actor_configs(target_url)
32 results = {}
33
34 # Use ThreadPoolExecutor for concurrent execution
35 with ThreadPoolExecutor(max_workers=len(actor_configs)) as executor:
36 future_to_actor = {
37 executor.submit(self._run_single_actor, name, config): name
38 for name, config in actor_configs.items()
39 }
40
41 for future in as_completed(future_to_actor):
42 actor_name = future_to_actor[future]
43 try:
44 name, html_content = future.result()
45 results[name] = html_content
46 except Exception as e:
47 self.logger.error(f"Actor {actor_name} failed: {str(e)}")
48 results[actor_name] = None
49
50 return results
51
52 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:
53 """Get actor configurations for the target URL."""
54 return {
55 "cheerio-scraper": {
56 "actor_id": "apify/cheerio-scraper",
57 "input": {
58 "startUrls": [{"url": target_url}],
59 "maxRequestRetries": 3,
60 "requestTimeoutSecs": 30,
61 "maxRequestsPerCrawl": 1,
62 "pseudoUrls": [],
63 "linkSelector": "",
64 "pageFunction": """
65 async function pageFunction(context) {
66 const { request, log, skipLinks, $ } = context;
67 return {
68 url: request.url,
69 title: $('title').text(),
70 html: $('html').html()
71 };
72 }
73 """,
74 "proxyConfiguration": {"useApifyProxy": True}
75 }
76 },
77 "web-scraper": {
78 "actor_id": "apify/web-scraper",
79 "input": {
80 "startUrls": [{"url": target_url}],
81 "maxRequestRetries": 3,
82 "requestTimeoutSecs": 30,
83 "maxPagesPerCrawl": 1,
84 "pageFunction": """
85 async function pageFunction(context) {
86 const { request, log, skipLinks, $ } = context;
87 return {
88 url: request.url,
89 title: $('title').text(),
90 html: $('html').html()
91 };
92 }
93 """,
94 "proxyConfiguration": {"useApifyProxy": True}
95 }
96 },
97 "website-content-crawler": {
98 "actor_id": "apify/website-content-crawler",
99 "input": {
100 "startUrls": [{"url": target_url}],
101 "maxCrawlPages": 1,
102 "crawler": "playwright",
103 "proxyConfiguration": {"useApifyProxy": True}
104 }
105 }
106 }
107
108 def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> tuple[str, Optional[str]]:
109 """
110 Run a single actor and extract HTML content.
111
112 Args:
113 actor_name: Name of the actor
114 config: Actor configuration
115
116 Returns:
117 Tuple of (actor_name, html_content)
118 """
119 try:
120 self.logger.info(f"Starting {actor_name}...")
121
122 result = self.runner.run_actor(
123 config["actor_id"],
124 config["input"],
125 "auto"
126 )
127
128 if not result:
129 self.logger.warning(f"{actor_name} returned no results")
130 return actor_name, None
131
132 # Extract HTML based on result type
133 html_content = self._extract_html_from_result(result, actor_name)
134
135 if html_content:
136 self.logger.info(f"{actor_name} completed successfully: {len(html_content):,} chars")
137 else:
138 self.logger.warning(f"{actor_name} returned no HTML content")
139
140 return actor_name, html_content
141
142 except Exception as e:
143 self.logger.error(f"Error running {actor_name}: {str(e)}")
144 return actor_name, None
145
146 def _extract_html_from_result(self, result: Any, actor_name: str) -> Optional[str]:
147 """Extract HTML content from actor result."""
148 try:
149 if isinstance(result, list) and result:
150 # Dataset result
151 item = result[0]
152 return item.get('html') or item.get('content', '')
153 elif isinstance(result, dict):
154 # Key-value store result
155 if 'OUTPUT' in result:
156 output = result['OUTPUT']
157 if isinstance(output, dict):
158 return output.get('html') or output.get('content', '')
159 elif isinstance(output, str):
160 return output
161
162 self.logger.warning(f"Unexpected result format from {actor_name}")
163 return None
164
165 except Exception as e:
166 self.logger.error(f"Error extracting HTML from {actor_name}: {str(e)}")
167 return None
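
A short usage sketch (not a repository file); it assumes APIFY_TOKEN is set and uses a placeholder URL. The three actors run concurrently and each entry may be None if that actor failed.

import os
from llmscraper.scraping import MultiActorScraper

scraper = MultiActorScraper(os.environ["APIFY_TOKEN"])
results = scraper.scrape_with_multiple_actors("https://example.com")
for actor_name, html in results.items():
    print(f"{actor_name}: {len(html):,} chars" if html else f"{actor_name}: no content")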

src/llmscraper/utils/__init__.py

1"""
2Utilities module for ScraperCodeGenerator.
3"""
4
5from .html_utils import is_html, prune_html, extract_text_content, validate_html_structure
6from .config import get_api_key, validate_required_keys, setup_logging
7
8__all__ = [
9 "is_html",
10 "prune_html",
11 "extract_text_content",
12 "validate_html_structure",
13 "get_api_key",
14 "validate_required_keys",
15 "setup_logging"
16]

src/llmscraper/utils/config.py

1"""
2Configuration and environment utilities.
3"""
4
5import os
6from typing import Optional
7
8
9def get_api_key(key_name: str, provided_key: Optional[str] = None) -> Optional[str]:
10 """
11 Get API key from provided value or environment variable.
12
13 Args:
14 key_name: Name of the environment variable
15 provided_key: Explicitly provided key (takes precedence)
16
17 Returns:
18 API key or None if not found
19 """
20 if provided_key and provided_key.strip():
21 return provided_key.strip()
22
23 return os.getenv(key_name)
24
25
26def validate_required_keys(**keys) -> dict[str, str]:
27 """
28 Validate that all required API keys are present.
29
30 Args:
31 **keys: Key-value pairs of key names and values
32
33 Returns:
34 Dict of validated keys
35
36 Raises:
37 ValueError: If any required key is missing
38 """
39 validated = {}
40 missing = []
41
42 for key_name, key_value in keys.items():
43 if not key_value or not key_value.strip():
44 missing.append(key_name)
45 else:
46 validated[key_name] = key_value.strip()
47
48 if missing:
49 raise ValueError(f"Missing required API keys: {', '.join(missing)}")
50
51 return validated
52
53
54def setup_logging(level: str = "INFO") -> None:
55 """
56 Setup logging configuration.
57
58 Args:
59 level: Logging level (DEBUG, INFO, WARNING, ERROR)
60 """
61 import logging
62
63 logging.basicConfig(
64 level=getattr(logging, level.upper()),
65 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
66 handlers=[
67 logging.StreamHandler()
68 ]
69 )
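
A brief sketch (not a repository file) of the configuration helpers; validate_required_keys raises ValueError when a key is missing, so callers typically check the values or wrap the call in try/except.

from llmscraper.utils import get_api_key, setup_logging, validate_required_keys

setup_logging("DEBUG")
apify_token = get_api_key("APIFY_TOKEN")           # falls back to the environment variable
claude_key = get_api_key("CLAUDE_API_KEY", None)
keys = validate_required_keys(apify_token=apify_token, claude_api_key=claude_key)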

src/llmscraper/utils/html_utils.py

1"""
2HTML utility functions for processing web content.
3"""
4
5from typing import Optional
6from bs4 import BeautifulSoup, Comment, NavigableString
7import re
8
9
10def is_html(text_content: str) -> bool:
11 """
12 Check if a string is likely HTML content.
13
14 Args:
15 text_content: The text content to check
16
17 Returns:
18 True if the content appears to be HTML
19 """
20 if not text_content or not isinstance(text_content, str):
21 return False
22
23 content_lower = text_content.lower()
24 return '<html>' in content_lower and '<body>' in content_lower
25
26
27def prune_html(html_content: str, max_list_items: int = 5, max_text_length: int = 500) -> str:
28 """
29 Clean and shorten HTML content to reduce token count while preserving structure.
30
31 Args:
32 html_content: The raw HTML content to process
33 max_list_items: Maximum number of list items to keep
34 max_text_length: Maximum length of text content in any tag
35
36 Returns:
37 The cleaned and shortened HTML
38 """
39 if not html_content or not isinstance(html_content, str):
40 return ""
41
42 try:
43 soup = BeautifulSoup(html_content, 'html.parser')
44
45 # Remove unwanted tags entirely
46 unwanted_tags = ['script', 'style', 'svg', 'noscript', 'meta', 'link']
47 for tag_name in unwanted_tags:
48 for tag in soup.find_all(tag_name):
49 tag.decompose()
50
51 # Remove HTML comments
52 for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
53 comment.extract()
54
55 # Remove unwanted attributes from all tags
56 allowed_attributes = {'id', 'class', 'href', 'src', 'alt', 'title'}
57 for tag in soup.find_all(True):
58 if hasattr(tag, 'attrs'):
59 tag.attrs = {key: value for key, value in tag.attrs.items()
60 if key in allowed_attributes}
61
62 # Truncate lists and tables
63 list_and_table_tags = ['ul', 'ol', 'table', 'tbody', 'thead']
64 for tag_name in list_and_table_tags:
65 for tag in soup.find_all(tag_name):
66 children = list(tag.children)
67 # Filter out NavigableString objects (text nodes, whitespace)
68 non_text_children = [child for child in children if not isinstance(child, NavigableString)]
69
70 if len(non_text_children) > max_list_items:
71 # Keep only the first max_list_items children
72 for child in non_text_children[max_list_items:]:
73 child.decompose()
74
75 # Add a comment indicating truncation
76 if tag.name in ['ul', 'ol']:
77 truncation_notice = soup.new_tag("li")
78 truncation_notice.string = f"... ({len(non_text_children) - max_list_items} more items)"
79 tag.append(truncation_notice)
80 elif tag.name == 'table':
81 truncation_notice = soup.new_tag("tr")
82 td = soup.new_tag("td")
83 td.string = f"... ({len(non_text_children) - max_list_items} more rows)"
84 truncation_notice.append(td)
85 tag.append(truncation_notice)
86
87 # Truncate long text content
88 for element in soup.find_all(string=True):
89 if isinstance(element, NavigableString) and not isinstance(element, Comment):
90 text = str(element).strip()
91 if len(text) > max_text_length:
92 element.replace_with(text[:max_text_length] + "...")
93
94 # Return the cleaned HTML
95 return str(soup)
96
97 except Exception as e:
98 # If parsing fails, return original content truncated
99 return html_content[:max_text_length * 10] if len(html_content) > max_text_length * 10 else html_content
100
101
102def extract_text_content(html_content: str) -> str:
103 """
104 Extract clean text content from HTML.
105
106 Args:
107 html_content: HTML content to extract text from
108
109 Returns:
110 Clean text content
111 """
112 if not html_content:
113 return ""
114
115 try:
116 soup = BeautifulSoup(html_content, 'html.parser')
117 return soup.get_text(separator=' ', strip=True)
118 except Exception:
119 return html_content
120
121
122def validate_html_structure(html_content: str) -> bool:
123 """
124 Validate basic HTML structure.
125
126 Args:
127 html_content: HTML content to validate
128
129 Returns:
130 True if HTML has basic valid structure
131 """
132 if not html_content:
133 return False
134
135 try:
136 soup = BeautifulSoup(html_content, 'html.parser')
137
138 # Check for basic HTML elements
139 has_html_tag = soup.find('html') is not None
140 has_body_tag = soup.find('body') is not None
141 has_content = len(soup.get_text(strip=True)) > 0
142
143 return has_html_tag or has_body_tag or has_content
144
145 except Exception:
146 return False
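
A small sketch (not a repository file) showing the pruning behaviour on a synthetic page: the 20-item list is cut to three items plus a "... (17 more items)" notice, which is what keeps the token count down before the HTML is sent to Claude.

from llmscraper.utils import is_html, prune_html, extract_text_content

raw = "<html><body><ul>" + "".join(f"<li>Item {i}</li>" for i in range(20)) + "</ul></body></html>"
print(is_html(raw))                                # True
pruned = prune_html(raw, max_list_items=3, max_text_length=100)
print(len(raw), "->", len(pruned))
print(extract_text_content(pruned))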