ScraperCodeGenerator
An intelligent web scraping tool that automatically generates custom scraping code for any website.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 1
Monthly users: 1
Last modified: a day ago
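The Actor takes a targetUrl, a userGoal, and a claudeApiKey as input and pushes its results (the generated script, quality scores, and any test-extracted data) to the default dataset, as main.py below shows. A minimal sketch of calling it from Python with the Apify client, assuming a placeholder actor identifier and illustrative input values:

from apify_client import ApifyClient

# Placeholder token and actor identifier -- substitute your own values.
client = ApifyClient("<YOUR_APIFY_TOKEN>")

run_input = {
    "targetUrl": "https://example.com/products",        # page to scrape
    "userGoal": "Extract product names and prices",     # what the generated scraper should return
    "claudeApiKey": "<YOUR_ANTHROPIC_API_KEY>",          # used to generate the scraping code
}

# "<username>/scrapercodegenerator" is an assumed identifier for this Actor.
run = client.actor("<username>/scrapercodegenerator").call(run_input=run_input)

# Each dataset item carries fields such as "generated_script", "best_actor",
# "quality_scores", and "data" (see the output_data dict in main.py below).
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item.get("best_actor"), len(item.get("generated_script", "")))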
.gitignore
# --- General ---
.DS_Store
.env
.env.*

# --- Logs ---
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

# --- IDEs ---
.vscode/*
!.vscode/extensions.json
.idea/
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

# --- Python ---
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.venv/
venv/
ENV/
env/
.env
.env.*
.pytest_cache/
.coverage
htmlcov/
.tox/
.cache
.mypy_cache/
.dmypy.json
dmypy.json

# Project specific
scraped_results/*.html

# --- Node.js / Frontend ---
frontend/node_modules/
frontend/dist/
frontend/dist-ssr/
frontend/.pnp
frontend/.pnp.js
frontend/.npm
node_modules
dist
dist-ssr
*.local

# Added by Apify CLI
storage
.venv

# --- Apify ---
storage/
apify_storage/

# --- Local test files ---
input.json
test_*
.python-version
3.10
Dockerfile
# Use the official Apify Python base image
FROM apify/actor-python:3.11

# Copy requirements and install dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the source code
COPY . ./

# Set the entrypoint
CMD ["python", "main.py"]
cli.py
#!/usr/bin/env python3
"""
CLI Entry Point for LLMScraper
"""

import sys
import os

# Add src to path for development
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from llmscraper.main import main

if __name__ == "__main__":
    sys.exit(main())
main.py
"""
Apify Actor entry point for LLMScraper.
"""

import asyncio
import os
import sys

# Add src to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from apify import Actor
from llmscraper import run_intelligent_scraper


async def main():
    """Main actor function."""
    async with Actor:
        Actor.log.info("DEBUG: Inside Actor context")
        Actor.log.info("🚀 LLMScraper Actor starting...")

        # Get input
        actor_input = await Actor.get_input() or {}

        # Extract parameters
        target_url = actor_input.get('targetUrl')
        user_goal = actor_input.get('userGoal')
        claude_api_key = actor_input.get('claudeApiKey')

        # Use actor token for Apify operations
        apify_token = Actor.config.token

        if not target_url:
            raise ValueError("targetUrl is required")

        if not user_goal:
            raise ValueError("userGoal is required")

        if not claude_api_key:
            raise ValueError("claudeApiKey is required")

        Actor.log.info(f"Target URL: {target_url}")
        Actor.log.info(f"User Goal: {user_goal}")

        Actor.log.info("DEBUG: About to call run_intelligent_scraper")

        try:
            # Run the intelligent scraper
            Actor.log.info("DEBUG: About to call run_intelligent_scraper with parameters:")
            Actor.log.info(f" - target_url: {target_url}")
            Actor.log.info(f" - user_goal: {user_goal}")
            Actor.log.info(f" - has claude_api_key: {bool(claude_api_key)}")
            Actor.log.info(f" - has apify_token: {bool(apify_token)}")

            results = await run_intelligent_scraper(
                target_url=target_url,
                user_goal=user_goal,
                claude_api_key=claude_api_key,
                apify_token=apify_token,
                for_actor=True,          # Set for actor mode
                actor_logger=Actor.log,  # Pass Actor logger
                test_script=True         # Enable testing in actor mode
            )

            Actor.log.info("DEBUG: run_intelligent_scraper completed")
            Actor.log.info(f"DEBUG: Results type: {type(results)}")
            Actor.log.info(f"DEBUG: Results success: {getattr(results, 'success', 'No success attr')}")

            if hasattr(results, 'error_message') and results.error_message:
                Actor.log.error(f"DEBUG: Results error: {results.error_message}")

            if hasattr(results, 'quality_scores') and results.quality_scores:
                Actor.log.info(f"DEBUG: Quality scores: {results.quality_scores}")

            if hasattr(results, 'best_actor') and results.best_actor:
                Actor.log.info(f"DEBUG: Best actor: {results.best_actor}")

            if hasattr(results, 'generated_script') and results.generated_script:
                Actor.log.info(f"DEBUG: Generated script length: {len(results.generated_script)} characters")

            if hasattr(results, 'extracted_data') and results.extracted_data:
                Actor.log.info(f"DEBUG: Test extraction successful: {len(results.extracted_data) if isinstance(results.extracted_data, list) else 1} items")

            if results.success:
                Actor.log.info(f"Scraping completed successfully!")

                # Show the generated script in actor mode
                if hasattr(results, 'generated_script') and results.generated_script:
                    Actor.log.info("\n" + "="*60)
                    Actor.log.info("📄 GENERATED ACTOR SCRIPT")
                    Actor.log.info("="*60)
                    Actor.log.info(results.generated_script)
                    Actor.log.info("="*60)

                # Create output data
                output_data = {
                    "url": target_url,
                    "title": getattr(results, 'title', 'N/A'),
                    "data": results.extracted_data if hasattr(results, 'extracted_data') else {},
                    "generated_script": getattr(results, 'generated_script', ''),
                    "best_actor": getattr(results, 'best_actor', ''),
                    "quality_scores": getattr(results, 'quality_scores', {}),
                    "timestamp": "2025-06-29T20:00:00Z",  # You might want to use actual timestamp
                    "success": True,
                    "error": None
                }

                # Push results to default dataset
                await Actor.push_data([output_data])
            else:
                Actor.log.error(f"Scraping failed: {results.error_message}")

                # Still push error data for tracking
                error_data = {
                    "url": target_url,
                    "title": "N/A",
                    "data": {},
                    "timestamp": "2025-06-29T20:00:00Z",
                    "success": False,
                    "error": results.error_message
                }

                await Actor.push_data([error_data])

            Actor.log.info("✅ LLMScraper Actor finished successfully!")

        except Exception as e:
            Actor.log.error(f"❌ Error during scraping: {str(e)}")
            raise


if __name__ == "__main__":
    asyncio.run(main())
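For reference, the input handling in main.py above implies an Actor input JSON along these lines (values are illustrative placeholders):

{
    "targetUrl": "https://example.com/products",
    "userGoal": "Extract all product names, prices, and ratings",
    "claudeApiKey": "<YOUR_ANTHROPIC_API_KEY>"
}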
package.json
{
  "name": "scraper-code-generator",
  "version": "0.1.0",
  "description": "Intelligent web scraping framework using AI-powered quality evaluation",
  "main": "main.py",
  "scripts": {
    "start": "python main.py"
  },
  "dependencies": {},
  "author": "",
  "license": "MIT"
}
pyproject.toml
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "llmscraper"
version = "0.1.0"
description = "Intelligent web scraping framework using AI-powered quality evaluation and multiple scraping strategies"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "anthropic>=0.54.0",
    "apify-client>=1.11.0",
    "beautifulsoup4>=4.12.0",
    "apify>=1.5.0",
]

[project.scripts]
llmscraper = "llmscraper.main:main"

[project.optional-dependencies]
dev = [
    "pytest>=7.0",
    "pytest-asyncio>=0.21.0",
    "black>=23.0",
    "isort>=5.12.0",
    "mypy>=1.5.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/llmscraper"]

[tool.hatch.build.targets.sdist]
include = [
    "/src",
    "/tests",
    "/examples.json",
    "/README.md",
]
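A note on the [project.scripts] table above: installing the package (for example with an editable install, pip install -e ., from the repository root) exposes an llmscraper console command bound to llmscraper.main:main, the same entry point that cli.py wraps for development use; the command-line arguments it accepts are not part of this listing.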
requirements.txt
apify>=1.5.0
apify-client>=1.5.0
anthropic>=0.7.0
beautifulsoup4>=4.12.0
uv.lock
version = 1
revision = 2
requires-python = ">=3.10"
[[package]]name = "annotated-types"version = "0.7.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },]
[[package]]name = "anthropic"version = "0.54.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "anyio" }, { name = "distro" }, { name = "httpx" }, { name = "jiter" }, { name = "pydantic" }, { name = "sniffio" }, { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/89/28/80cb9bb6e7ce77d404145b51da4257455805c17f0a6be528ff3286e3882f/anthropic-0.54.0.tar.gz", hash = "sha256:5e6f997d97ce8e70eac603c3ec2e7f23addeff953fbbb76b19430562bb6ba815", size = 312376, upload-time = "2025-06-11T02:46:27.642Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/de/b9/6ffb48e82c5e97b03cecee872d134a6b6666c2767b2d32ed709f3a60a8fe/anthropic-0.54.0-py3-none-any.whl", hash = "sha256:c1062a0a905daeec17ca9c06c401e4b3f24cb0495841d29d752568a1d4018d56", size = 288774, upload-time = "2025-06-11T02:46:25.578Z" },]
[[package]]name = "anyio"version = "4.9.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "idna" }, { name = "sniffio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" },]sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" },]
[[package]]name = "apify-client"version = "1.11.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "apify-shared" }, { name = "colorama" }, { name = "httpx" }, { name = "more-itertools" },]sdist = { url = "https://files.pythonhosted.org/packages/49/44/b7cae857f2129d4093bc5a0a2267fcbba7905207a0b7cc424dc3c7c90291/apify_client-1.11.0.tar.gz", hash = "sha256:c2e151754c35be9bc7c1028bf7cb127aeb1ffa2fbd1ec1ad7e97b901deb32e08", size = 346095, upload-time = "2025-06-13T11:46:39.129Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/8f/24/d3273bfe5b4a96fd60c8d554edbab99274fae8cb2347b96f2e3fa0bc4d5b/apify_client-1.11.0-py3-none-any.whl", hash = "sha256:9d691960bdbeee17624a2a82aafc4f0bfba9b48820a48f559b7eba76bf01cb3c", size = 82550, upload-time = "2025-06-13T11:46:37.483Z" },]
[[package]]name = "apify-shared"version = "1.4.1"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/b2/a6/c8e2fa0b3bdc479d3ecde778e2381af199f910cf7c8baa3c207bcfe26e47/apify_shared-1.4.1.tar.gz", hash = "sha256:16e617c840fd27bf38d980f079c0b867c7378f68c7006b3d5a7d530d43930507", size = 13871, upload-time = "2025-04-28T12:20:01.113Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/1d/f3/3446c8a7986fdc087024d4e174e4b3f587097a9b28f6f8e8c788199225b2/apify_shared-1.4.1-py3-none-any.whl", hash = "sha256:abac5712b6e8eb96693204cbb2702905e1971d9084b1716e7337852b5005290e", size = 12706, upload-time = "2025-04-28T12:19:59.792Z" },]
[[package]]name = "certifi"version = "2025.6.15"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" },]
[[package]]name = "colorama"version = "0.4.6"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },]
[[package]]name = "distro"version = "1.9.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },]
[[package]]name = "exceptiongroup"version = "1.3.0"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.13'" },]sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },]
[[package]]name = "h11"version = "0.16.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },]
[[package]]name = "httpcore"version = "1.0.9"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "certifi" }, { name = "h11" },]sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },]
[[package]]name = "httpx"version = "0.28.1"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "anyio" }, { name = "certifi" }, { name = "httpcore" }, { name = "idna" },]sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },]
[[package]]name = "idna"version = "3.10"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },]
[[package]]name = "jiter"version = "0.10.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ee/9d/ae7ddb4b8ab3fb1b51faf4deb36cb48a4fbbd7cb36bad6a5fca4741306f7/jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500", size = 162759, upload-time = "2025-05-18T19:04:59.73Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/be/7e/4011b5c77bec97cb2b572f566220364e3e21b51c48c5bd9c4a9c26b41b67/jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303", size = 317215, upload-time = "2025-05-18T19:03:04.303Z" }, { url = "https://files.pythonhosted.org/packages/8a/4f/144c1b57c39692efc7ea7d8e247acf28e47d0912800b34d0ad815f6b2824/jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e", size = 322814, upload-time = "2025-05-18T19:03:06.433Z" }, { url = "https://files.pythonhosted.org/packages/63/1f/db977336d332a9406c0b1f0b82be6f71f72526a806cbb2281baf201d38e3/jiter-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8b3e0068c26ddedc7abc6fac37da2d0af16b921e288a5a613f4b86f050354f", size = 345237, upload-time = "2025-05-18T19:03:07.833Z" }, { url = "https://files.pythonhosted.org/packages/d7/1c/aa30a4a775e8a672ad7f21532bdbfb269f0706b39c6ff14e1f86bdd9e5ff/jiter-0.10.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:286299b74cc49e25cd42eea19b72aa82c515d2f2ee12d11392c56d8701f52224", size = 370999, upload-time = "2025-05-18T19:03:09.338Z" }, { url = "https://files.pythonhosted.org/packages/35/df/f8257abc4207830cb18880781b5f5b716bad5b2a22fb4330cfd357407c5b/jiter-0.10.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ed5649ceeaeffc28d87fb012d25a4cd356dcd53eff5acff1f0466b831dda2a7", size = 491109, upload-time = "2025-05-18T19:03:11.13Z" }, { url = "https://files.pythonhosted.org/packages/06/76/9e1516fd7b4278aa13a2cc7f159e56befbea9aa65c71586305e7afa8b0b3/jiter-0.10.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2ab0051160cb758a70716448908ef14ad476c3774bd03ddce075f3c1f90a3d6", size = 388608, upload-time = "2025-05-18T19:03:12.911Z" }, { url = "https://files.pythonhosted.org/packages/6d/64/67750672b4354ca20ca18d3d1ccf2c62a072e8a2d452ac3cf8ced73571ef/jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03997d2f37f6b67d2f5c475da4412be584e1cec273c1cfc03d642c46db43f8cf", size = 352454, upload-time = "2025-05-18T19:03:14.741Z" }, { url = "https://files.pythonhosted.org/packages/96/4d/5c4e36d48f169a54b53a305114be3efa2bbffd33b648cd1478a688f639c1/jiter-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c404a99352d839fed80d6afd6c1d66071f3bacaaa5c4268983fc10f769112e90", size = 391833, upload-time = "2025-05-18T19:03:16.426Z" }, { url = "https://files.pythonhosted.org/packages/0b/de/ce4a6166a78810bd83763d2fa13f85f73cbd3743a325469a4a9289af6dae/jiter-0.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66e989410b6666d3ddb27a74c7e50d0829704ede652fd4c858e91f8d64b403d0", size = 523646, upload-time = "2025-05-18T19:03:17.704Z" }, { url = "https://files.pythonhosted.org/packages/a2/a6/3bc9acce53466972964cf4ad85efecb94f9244539ab6da1107f7aed82934/jiter-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b532d3af9ef4f6374609a3bcb5e05a1951d3bf6190dc6b176fdb277c9bbf15ee", size = 514735, 
upload-time = "2025-05-18T19:03:19.44Z" }, { url = "https://files.pythonhosted.org/packages/b4/d8/243c2ab8426a2a4dea85ba2a2ba43df379ccece2145320dfd4799b9633c5/jiter-0.10.0-cp310-cp310-win32.whl", hash = "sha256:da9be20b333970e28b72edc4dff63d4fec3398e05770fb3205f7fb460eb48dd4", size = 210747, upload-time = "2025-05-18T19:03:21.184Z" }, { url = "https://files.pythonhosted.org/packages/37/7a/8021bd615ef7788b98fc76ff533eaac846322c170e93cbffa01979197a45/jiter-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:f59e533afed0c5b0ac3eba20d2548c4a550336d8282ee69eb07b37ea526ee4e5", size = 207484, upload-time = "2025-05-18T19:03:23.046Z" }, { url = "https://files.pythonhosted.org/packages/1b/dd/6cefc6bd68b1c3c979cecfa7029ab582b57690a31cd2f346c4d0ce7951b6/jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978", size = 317473, upload-time = "2025-05-18T19:03:25.942Z" }, { url = "https://files.pythonhosted.org/packages/be/cf/fc33f5159ce132be1d8dd57251a1ec7a631c7df4bd11e1cd198308c6ae32/jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc", size = 321971, upload-time = "2025-05-18T19:03:27.255Z" }, { url = "https://files.pythonhosted.org/packages/68/a4/da3f150cf1d51f6c472616fb7650429c7ce053e0c962b41b68557fdf6379/jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d", size = 345574, upload-time = "2025-05-18T19:03:28.63Z" }, { url = "https://files.pythonhosted.org/packages/84/34/6e8d412e60ff06b186040e77da5f83bc158e9735759fcae65b37d681f28b/jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2", size = 371028, upload-time = "2025-05-18T19:03:30.292Z" }, { url = "https://files.pythonhosted.org/packages/fb/d9/9ee86173aae4576c35a2f50ae930d2ccb4c4c236f6cb9353267aa1d626b7/jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61", size = 491083, upload-time = "2025-05-18T19:03:31.654Z" }, { url = "https://files.pythonhosted.org/packages/d9/2c/f955de55e74771493ac9e188b0f731524c6a995dffdcb8c255b89c6fb74b/jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db", size = 388821, upload-time = "2025-05-18T19:03:33.184Z" }, { url = "https://files.pythonhosted.org/packages/81/5a/0e73541b6edd3f4aada586c24e50626c7815c561a7ba337d6a7eb0a915b4/jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5", size = 352174, upload-time = "2025-05-18T19:03:34.965Z" }, { url = "https://files.pythonhosted.org/packages/1c/c0/61eeec33b8c75b31cae42be14d44f9e6fe3ac15a4e58010256ac3abf3638/jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606", size = 391869, upload-time = "2025-05-18T19:03:36.436Z" }, { url = "https://files.pythonhosted.org/packages/41/22/5beb5ee4ad4ef7d86f5ea5b4509f680a20706c4a7659e74344777efb7739/jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605", size = 523741, upload-time = "2025-05-18T19:03:38.168Z" }, { url = 
"https://files.pythonhosted.org/packages/ea/10/768e8818538e5817c637b0df52e54366ec4cebc3346108a4457ea7a98f32/jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5", size = 514527, upload-time = "2025-05-18T19:03:39.577Z" }, { url = "https://files.pythonhosted.org/packages/73/6d/29b7c2dc76ce93cbedabfd842fc9096d01a0550c52692dfc33d3cc889815/jiter-0.10.0-cp311-cp311-win32.whl", hash = "sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7", size = 210765, upload-time = "2025-05-18T19:03:41.271Z" }, { url = "https://files.pythonhosted.org/packages/c2/c9/d394706deb4c660137caf13e33d05a031d734eb99c051142e039d8ceb794/jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812", size = 209234, upload-time = "2025-05-18T19:03:42.918Z" }, { url = "https://files.pythonhosted.org/packages/6d/b5/348b3313c58f5fbfb2194eb4d07e46a35748ba6e5b3b3046143f3040bafa/jiter-0.10.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1e274728e4a5345a6dde2d343c8da018b9d4bd4350f5a472fa91f66fda44911b", size = 312262, upload-time = "2025-05-18T19:03:44.637Z" }, { url = "https://files.pythonhosted.org/packages/9c/4a/6a2397096162b21645162825f058d1709a02965606e537e3304b02742e9b/jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7202ae396446c988cb2a5feb33a543ab2165b786ac97f53b59aafb803fef0744", size = 320124, upload-time = "2025-05-18T19:03:46.341Z" }, { url = "https://files.pythonhosted.org/packages/2a/85/1ce02cade7516b726dd88f59a4ee46914bf79d1676d1228ef2002ed2f1c9/jiter-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23ba7722d6748b6920ed02a8f1726fb4b33e0fd2f3f621816a8b486c66410ab2", size = 345330, upload-time = "2025-05-18T19:03:47.596Z" }, { url = "https://files.pythonhosted.org/packages/75/d0/bb6b4f209a77190ce10ea8d7e50bf3725fc16d3372d0a9f11985a2b23eff/jiter-0.10.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:371eab43c0a288537d30e1f0b193bc4eca90439fc08a022dd83e5e07500ed026", size = 369670, upload-time = "2025-05-18T19:03:49.334Z" }, { url = "https://files.pythonhosted.org/packages/a0/f5/a61787da9b8847a601e6827fbc42ecb12be2c925ced3252c8ffcb56afcaf/jiter-0.10.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c675736059020365cebc845a820214765162728b51ab1e03a1b7b3abb70f74c", size = 489057, upload-time = "2025-05-18T19:03:50.66Z" }, { url = "https://files.pythonhosted.org/packages/12/e4/6f906272810a7b21406c760a53aadbe52e99ee070fc5c0cb191e316de30b/jiter-0.10.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c5867d40ab716e4684858e4887489685968a47e3ba222e44cde6e4a2154f959", size = 389372, upload-time = "2025-05-18T19:03:51.98Z" }, { url = "https://files.pythonhosted.org/packages/e2/ba/77013b0b8ba904bf3762f11e0129b8928bff7f978a81838dfcc958ad5728/jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395bb9a26111b60141757d874d27fdea01b17e8fac958b91c20128ba8f4acc8a", size = 352038, upload-time = "2025-05-18T19:03:53.703Z" }, { url = "https://files.pythonhosted.org/packages/67/27/c62568e3ccb03368dbcc44a1ef3a423cb86778a4389e995125d3d1aaa0a4/jiter-0.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6842184aed5cdb07e0c7e20e5bdcfafe33515ee1741a6835353bb45fe5d1bd95", size = 391538, upload-time = "2025-05-18T19:03:55.046Z" }, { url = 
"https://files.pythonhosted.org/packages/c0/72/0d6b7e31fc17a8fdce76164884edef0698ba556b8eb0af9546ae1a06b91d/jiter-0.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62755d1bcea9876770d4df713d82606c8c1a3dca88ff39046b85a048566d56ea", size = 523557, upload-time = "2025-05-18T19:03:56.386Z" }, { url = "https://files.pythonhosted.org/packages/2f/09/bc1661fbbcbeb6244bd2904ff3a06f340aa77a2b94e5a7373fd165960ea3/jiter-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:533efbce2cacec78d5ba73a41756beff8431dfa1694b6346ce7af3a12c42202b", size = 514202, upload-time = "2025-05-18T19:03:57.675Z" }, { url = "https://files.pythonhosted.org/packages/1b/84/5a5d5400e9d4d54b8004c9673bbe4403928a00d28529ff35b19e9d176b19/jiter-0.10.0-cp312-cp312-win32.whl", hash = "sha256:8be921f0cadd245e981b964dfbcd6fd4bc4e254cdc069490416dd7a2632ecc01", size = 211781, upload-time = "2025-05-18T19:03:59.025Z" }, { url = "https://files.pythonhosted.org/packages/9b/52/7ec47455e26f2d6e5f2ea4951a0652c06e5b995c291f723973ae9e724a65/jiter-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7c7d785ae9dda68c2678532a5a1581347e9c15362ae9f6e68f3fdbfb64f2e49", size = 206176, upload-time = "2025-05-18T19:04:00.305Z" }, { url = "https://files.pythonhosted.org/packages/2e/b0/279597e7a270e8d22623fea6c5d4eeac328e7d95c236ed51a2b884c54f70/jiter-0.10.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0588107ec8e11b6f5ef0e0d656fb2803ac6cf94a96b2b9fc675c0e3ab5e8644", size = 311617, upload-time = "2025-05-18T19:04:02.078Z" }, { url = "https://files.pythonhosted.org/packages/91/e3/0916334936f356d605f54cc164af4060e3e7094364add445a3bc79335d46/jiter-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cafc4628b616dc32530c20ee53d71589816cf385dd9449633e910d596b1f5c8a", size = 318947, upload-time = "2025-05-18T19:04:03.347Z" }, { url = "https://files.pythonhosted.org/packages/6a/8e/fd94e8c02d0e94539b7d669a7ebbd2776e51f329bb2c84d4385e8063a2ad/jiter-0.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520ef6d981172693786a49ff5b09eda72a42e539f14788124a07530f785c3ad6", size = 344618, upload-time = "2025-05-18T19:04:04.709Z" }, { url = "https://files.pythonhosted.org/packages/6f/b0/f9f0a2ec42c6e9c2e61c327824687f1e2415b767e1089c1d9135f43816bd/jiter-0.10.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:554dedfd05937f8fc45d17ebdf298fe7e0c77458232bcb73d9fbbf4c6455f5b3", size = 368829, upload-time = "2025-05-18T19:04:06.912Z" }, { url = "https://files.pythonhosted.org/packages/e8/57/5bbcd5331910595ad53b9fd0c610392ac68692176f05ae48d6ce5c852967/jiter-0.10.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bc299da7789deacf95f64052d97f75c16d4fc8c4c214a22bf8d859a4288a1c2", size = 491034, upload-time = "2025-05-18T19:04:08.222Z" }, { url = "https://files.pythonhosted.org/packages/9b/be/c393df00e6e6e9e623a73551774449f2f23b6ec6a502a3297aeeece2c65a/jiter-0.10.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5161e201172de298a8a1baad95eb85db4fb90e902353b1f6a41d64ea64644e25", size = 388529, upload-time = "2025-05-18T19:04:09.566Z" }, { url = "https://files.pythonhosted.org/packages/42/3e/df2235c54d365434c7f150b986a6e35f41ebdc2f95acea3036d99613025d/jiter-0.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e2227db6ba93cb3e2bf67c87e594adde0609f146344e8207e8730364db27041", size = 350671, upload-time = "2025-05-18T19:04:10.98Z" }, { url = 
"https://files.pythonhosted.org/packages/c6/77/71b0b24cbcc28f55ab4dbfe029f9a5b73aeadaba677843fc6dc9ed2b1d0a/jiter-0.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:15acb267ea5e2c64515574b06a8bf393fbfee6a50eb1673614aa45f4613c0cca", size = 390864, upload-time = "2025-05-18T19:04:12.722Z" }, { url = "https://files.pythonhosted.org/packages/6a/d3/ef774b6969b9b6178e1d1e7a89a3bd37d241f3d3ec5f8deb37bbd203714a/jiter-0.10.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:901b92f2e2947dc6dfcb52fd624453862e16665ea909a08398dde19c0731b7f4", size = 522989, upload-time = "2025-05-18T19:04:14.261Z" }, { url = "https://files.pythonhosted.org/packages/0c/41/9becdb1d8dd5d854142f45a9d71949ed7e87a8e312b0bede2de849388cb9/jiter-0.10.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d0cb9a125d5a3ec971a094a845eadde2db0de85b33c9f13eb94a0c63d463879e", size = 513495, upload-time = "2025-05-18T19:04:15.603Z" }, { url = "https://files.pythonhosted.org/packages/9c/36/3468e5a18238bdedae7c4d19461265b5e9b8e288d3f86cd89d00cbb48686/jiter-0.10.0-cp313-cp313-win32.whl", hash = "sha256:48a403277ad1ee208fb930bdf91745e4d2d6e47253eedc96e2559d1e6527006d", size = 211289, upload-time = "2025-05-18T19:04:17.541Z" }, { url = "https://files.pythonhosted.org/packages/7e/07/1c96b623128bcb913706e294adb5f768fb7baf8db5e1338ce7b4ee8c78ef/jiter-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:75f9eb72ecb640619c29bf714e78c9c46c9c4eaafd644bf78577ede459f330d4", size = 205074, upload-time = "2025-05-18T19:04:19.21Z" }, { url = "https://files.pythonhosted.org/packages/54/46/caa2c1342655f57d8f0f2519774c6d67132205909c65e9aa8255e1d7b4f4/jiter-0.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:28ed2a4c05a1f32ef0e1d24c2611330219fed727dae01789f4a335617634b1ca", size = 318225, upload-time = "2025-05-18T19:04:20.583Z" }, { url = "https://files.pythonhosted.org/packages/43/84/c7d44c75767e18946219ba2d703a5a32ab37b0bc21886a97bc6062e4da42/jiter-0.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a4c418b1ec86a195f1ca69da8b23e8926c752b685af665ce30777233dfe070", size = 350235, upload-time = "2025-05-18T19:04:22.363Z" }, { url = "https://files.pythonhosted.org/packages/01/16/f5a0135ccd968b480daad0e6ab34b0c7c5ba3bc447e5088152696140dcb3/jiter-0.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d7bfed2fe1fe0e4dda6ef682cee888ba444b21e7a6553e03252e4feb6cf0adca", size = 207278, upload-time = "2025-05-18T19:04:23.627Z" }, { url = "https://files.pythonhosted.org/packages/1c/9b/1d646da42c3de6c2188fdaa15bce8ecb22b635904fc68be025e21249ba44/jiter-0.10.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:5e9251a5e83fab8d87799d3e1a46cb4b7f2919b895c6f4483629ed2446f66522", size = 310866, upload-time = "2025-05-18T19:04:24.891Z" }, { url = "https://files.pythonhosted.org/packages/ad/0e/26538b158e8a7c7987e94e7aeb2999e2e82b1f9d2e1f6e9874ddf71ebda0/jiter-0.10.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:023aa0204126fe5b87ccbcd75c8a0d0261b9abdbbf46d55e7ae9f8e22424eeb8", size = 318772, upload-time = "2025-05-18T19:04:26.161Z" }, { url = "https://files.pythonhosted.org/packages/7b/fb/d302893151caa1c2636d6574d213e4b34e31fd077af6050a9c5cbb42f6fb/jiter-0.10.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c189c4f1779c05f75fc17c0c1267594ed918996a231593a21a5ca5438445216", size = 344534, upload-time = "2025-05-18T19:04:27.495Z" }, { url = 
"https://files.pythonhosted.org/packages/01/d8/5780b64a149d74e347c5128d82176eb1e3241b1391ac07935693466d6219/jiter-0.10.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15720084d90d1098ca0229352607cd68256c76991f6b374af96f36920eae13c4", size = 369087, upload-time = "2025-05-18T19:04:28.896Z" }, { url = "https://files.pythonhosted.org/packages/e8/5b/f235a1437445160e777544f3ade57544daf96ba7e96c1a5b24a6f7ac7004/jiter-0.10.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4f2fb68e5f1cfee30e2b2a09549a00683e0fde4c6a2ab88c94072fc33cb7426", size = 490694, upload-time = "2025-05-18T19:04:30.183Z" }, { url = "https://files.pythonhosted.org/packages/85/a9/9c3d4617caa2ff89cf61b41e83820c27ebb3f7b5fae8a72901e8cd6ff9be/jiter-0.10.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce541693355fc6da424c08b7edf39a2895f58d6ea17d92cc2b168d20907dee12", size = 388992, upload-time = "2025-05-18T19:04:32.028Z" }, { url = "https://files.pythonhosted.org/packages/68/b1/344fd14049ba5c94526540af7eb661871f9c54d5f5601ff41a959b9a0bbd/jiter-0.10.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c50c40272e189d50006ad5c73883caabb73d4e9748a688b216e85a9a9ca3b9", size = 351723, upload-time = "2025-05-18T19:04:33.467Z" }, { url = "https://files.pythonhosted.org/packages/41/89/4c0e345041186f82a31aee7b9d4219a910df672b9fef26f129f0cda07a29/jiter-0.10.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fa3402a2ff9815960e0372a47b75c76979d74402448509ccd49a275fa983ef8a", size = 392215, upload-time = "2025-05-18T19:04:34.827Z" }, { url = "https://files.pythonhosted.org/packages/55/58/ee607863e18d3f895feb802154a2177d7e823a7103f000df182e0f718b38/jiter-0.10.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:1956f934dca32d7bb647ea21d06d93ca40868b505c228556d3373cbd255ce853", size = 522762, upload-time = "2025-05-18T19:04:36.19Z" }, { url = "https://files.pythonhosted.org/packages/15/d0/9123fb41825490d16929e73c212de9a42913d68324a8ce3c8476cae7ac9d/jiter-0.10.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86", size = 513427, upload-time = "2025-05-18T19:04:37.544Z" }, { url = "https://files.pythonhosted.org/packages/d8/b3/2bd02071c5a2430d0b70403a34411fc519c2f227da7b03da9ba6a956f931/jiter-0.10.0-cp314-cp314-win32.whl", hash = "sha256:ac509f7eccca54b2a29daeb516fb95b6f0bd0d0d8084efaf8ed5dfc7b9f0b357", size = 210127, upload-time = "2025-05-18T19:04:38.837Z" }, { url = "https://files.pythonhosted.org/packages/03/0c/5fe86614ea050c3ecd728ab4035534387cd41e7c1855ef6c031f1ca93e3f/jiter-0.10.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ed975b83a2b8639356151cef5c0d597c68376fc4922b45d0eb384ac058cfa00", size = 318527, upload-time = "2025-05-18T19:04:40.612Z" }, { url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" },]
[[package]]
name = "llmscraper"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
    { name = "anthropic" },
    { name = "apify-client" },
]

[package.metadata]
requires-dist = [
    { name = "anthropic", specifier = ">=0.54.0" },
    { name = "apify-client", specifier = ">=1.11.0" },
]
[[package]]name = "more-itertools"version = "10.7.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671, upload-time = "2025-04-22T14:17:41.838Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278, upload-time = "2025-04-22T14:17:40.49Z" },]
[[package]]name = "pydantic"version = "2.11.7"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, { name = "typing-extensions" }, { name = "typing-inspection" },]sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },]
[[package]]name = "pydantic-core"version = "2.33.2"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" }, { url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" }, { url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" }, { url = "https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" }, { url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" }, { url = "https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" }, { url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" }, { url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" }, { url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" }, { url = 
"https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" }, { url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" }, { url = "https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" }, { url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" }, { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" }, { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" }, { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" }, { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" }, { url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" }, { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" }, { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = 
"2025-04-23T18:31:13.536Z" }, { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" }, { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" }, { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" }, { url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" }, { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" }, { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" }, { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" }, { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" }, { url 
= "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" }, { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" }, { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" }, { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" }, { url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" }, { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" }, { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" }, { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" }, { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" }, { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" }, { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" }, { url = 
"https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" }, { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" }, { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" }, { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" }, { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" }, { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" }, { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" }, { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" }, { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" }, { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" }, { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, 
upload-time = "2025-04-23T18:32:14.034Z" }, { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" }, { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" }, { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, { url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" }, { url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" }, { url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" }, { url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" }, { url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" }, { url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" }, { url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = "2025-04-23T18:33:08.44Z" }, { url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" }, { url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" }, { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" }, { url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" }, { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" }, { url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" }, { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" }, { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" }, { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, { url = 
"https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },]
[[package]]name = "sniffio"version = "1.3.1"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },]
[[package]]name = "typing-extensions"version = "4.14.0"source = { registry = "https://pypi.org/simple" }sdist = { url = "https://files.pythonhosted.org/packages/d1/bc/51647cd02527e87d05cb083ccc402f93e441606ff1f01739a62c8ad09ba5/typing_extensions-4.14.0.tar.gz", hash = "sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4", size = 107423, upload-time = "2025-06-02T14:52:11.399Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af", size = 43839, upload-time = "2025-06-02T14:52:10.026Z" },]
[[package]]name = "typing-inspection"version = "0.4.1"source = { registry = "https://pypi.org/simple" }dependencies = [ { name = "typing-extensions" },]sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }wheels = [ { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },]
.actor/README.md
# ScraperCodeGenerator Actor

An intelligent web scraping actor that uses AI-powered quality evaluation and multiple scraping strategies to extract data from websites.

## Features

- **AI-Powered Quality Evaluation**: Uses Claude AI to evaluate the quality of extracted data
- **Multiple Scraping Strategies**: Employs different scraping techniques for optimal results
- **Intelligent Pipeline**: Automatically selects the best scraping approach based on the website structure

## Input

The actor expects the following input parameters:

- **Target URL** (required): The URL of the website to scrape
- **User Goal** (required): Description of what data you want to extract (e.g., "Extract product names, prices, and descriptions")
- **Claude API Key** (required): Your Anthropic Claude API key for AI-powered evaluation

## Output

The actor returns extracted data in a structured format based on your specified goal. Results are automatically saved to the default dataset.
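As a minimal sketch of reading those dataset items after a run with the `apify-client` Python package (the token and run ID are placeholders you supply yourself; the printed fields mirror the dataset view defined in `.actor/actor.json`):

```python
from apify_client import ApifyClient

# Placeholders: supply your own Apify API token and the ID of a finished run.
client = ApifyClient("<YOUR_APIFY_TOKEN>")
run = client.run("<RUN_ID>").get()

# Each record pushed by the actor is one item in the run's default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item.get("url"), item.get("success"))
```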
## Usage

1. Provide the target URL you want to scrape
2. Describe what data you want to extract in the "User Goal" field
3. Enter your Claude API key
4. Run the actor

The actor will intelligently analyze the website, extract the requested data, and evaluate the quality of the results using AI.
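For illustration, a sketch of running the actor programmatically instead of from the Apify Console might look like the following (the actor ID is a placeholder for your own deployment, and the input mirrors the schema in `.actor/actor.json`):

```python
from apify_client import ApifyClient

client = ApifyClient("<YOUR_APIFY_TOKEN>")

# Input fields follow the schema defined in .actor/actor.json.
run_input = {
    "targetUrl": "https://books.toscrape.com/",
    "userGoal": "Get me a list of all the books on the first page.",
    "claudeApiKey": "<YOUR_CLAUDE_API_KEY>",
}

# "<username>/scrapercodegenerator" is a placeholder actor ID.
run = client.actor("<username>/scrapercodegenerator").call(run_input=run_input)
print("Run finished with status:", run["status"])
```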
## Example Input

```json
{
  "targetUrl": "https://example-store.com/products",
  "userGoal": "Extract product names, prices, and availability status",
  "claudeApiKey": "sk-ant-..."
}
```

## Requirements

- Valid Claude API key from Anthropic
- Target website must be accessible
- Sufficient Apify compute units for the scraping operation
.actor/actor.json
{ "actorSpecification": 1, "name": "ScraperCodeGenerator", "version": "0.1", "buildTag": "latest", "environmentVariables": { "CLAUDE_API_KEY": "@claudeApiKey" }, "dockerfile": "./Dockerfile", "input": { "title": "Intelligent Scraper Settings", "description": "AI-powered web scraper that generates custom scraping code based on your requirements", "type": "object", "schemaVersion": 1, "properties": { "targetUrl": { "title": "Target URL", "description": "The URL of the website you want to scrape", "type": "string", "editor": "textfield", "prefill": "https://books.toscrape.com/" }, "userGoal": { "title": "Scraping Goal", "description": "Describe what data you want to extract from the website", "type": "string", "editor": "textarea", "prefill": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock." }, "claudeApiKey": { "title": "Claude API Key", "description": "Your Anthropic Claude API key for AI-powered code generation", "type": "string", "editor": "textfield", "isSecret": true }, "maxRetries": { "title": "Max Retries", "description": "Maximum number of retry attempts for scraping", "type": "integer", "minimum": 1, "maximum": 10, "default": 3 }, "timeout": { "title": "Timeout (seconds)", "description": "Timeout for each scraping attempt in seconds", "type": "integer", "minimum": 10, "maximum": 300, "default": 60 } }, "required": ["targetUrl", "userGoal", "claudeApiKey"] }, "storages": { "dataset": { "actorSpecification": 1, "views": { "scraped_data": { "title": "Scraped Data", "transformation": {}, "display": { "component": "table", "properties": { "url": { "label": "Source URL", "format": "link" }, "title": { "label": "Page Title", "format": "text" }, "data": { "label": "Extracted Data", "format": "object" }, "timestamp": { "label": "Scraped At", "format": "datetime" }, "success": { "label": "Success", "format": "boolean" }, "error": { "label": "Error Message", "format": "text" } } } } } } }}
quality_eval/claude_client.py
1
quality_eval/html_quality_evaluator.py
1"""2HTML Quality Evaluator Module3
4This module provides functionality to evaluate the quality of HTML documents 5fetched by automated browsers. It determines if the HTML is suitable for data 6extraction or if it represents a failure page (e.g., CAPTCHA, block page, etc.).7
8Uses Claude AI to perform intelligent analysis based on the user's extraction goals.9"""10
11import json12import logging13import re14from typing import Dict, Any, Optional15from dataclasses import dataclass16
17import anthropic18
19
20@dataclass21class EvaluationResult:22 """Result of HTML quality evaluation."""23 score: int # 1-10 scale24 reasoning: str25
26
27@dataclass28class PreEvaluationResult:29 """Result of pre-evaluation checks before sending to Claude."""30 is_valid_html: bool31 score: Optional[int] = None # If we can determine score without Claude32 reasoning: Optional[str] = None33 should_continue_to_claude: bool = True34
35
36class HTMLQualityEvaluator:37 """38 Evaluates HTML quality for web scraping using Claude AI.39 40 This class provides methods to analyze HTML content and determine41 if it's suitable for data extraction based on the user's goals.42 """43 44 def __init__(self, claude_api_key: str):45 """46 Initialize the HTML Quality Evaluator.47 48 Args:49 claude_api_key: Anthropic API key for Claude access50 """51 if not claude_api_key or not claude_api_key.strip():52 raise ValueError("claude_api_key cannot be empty")53 54 self.client = anthropic.Anthropic(api_key=claude_api_key)55 self.logger = logging.getLogger(__name__)56 57 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:58 """59 Perform basic validation checks on HTML content before sending to Claude.60 61 This method checks for obvious issues that don't require AI analysis:62 - Whether content is actually HTML63 - Empty or whitespace-only content64 65 Args:66 html_content: The HTML content to pre-evaluate67 68 Returns:69 PreEvaluationResult indicating if validation passed and whether to continue70 """71 if not html_content or not html_content.strip():72 return PreEvaluationResult(73 is_valid_html=False,74 score=1,75 reasoning="Content is empty or contains only whitespace",76 should_continue_to_claude=False77 )78 79 content = html_content.strip()80 content_lower = content.lower()81 82 # Check if content looks like HTML at all83 has_opening_closing_tags = '<' in content and '>' in content84 85 if not has_opening_closing_tags:86 return PreEvaluationResult(87 is_valid_html=False,88 score=1,89 reasoning="Content does not contain HTML tags",90 should_continue_to_claude=False91 )92 93 # Check for obvious non-HTML content94 if content.startswith('{') and content.endswith('}'):95 # Looks like JSON96 return PreEvaluationResult(97 is_valid_html=False,98 score=1,99 reasoning="Content appears to be JSON, not HTML",100 should_continue_to_claude=False101 )102 103 if content.startswith('<?xml'):104 # XML but not HTML105 return PreEvaluationResult(106 is_valid_html=False,107 score=1,108 reasoning="Content appears to be XML, not HTML",109 should_continue_to_claude=False110 )111 112 # If it looks like valid HTML, let Claude evaluate it113 return PreEvaluationResult(114 is_valid_html=True,115 should_continue_to_claude=True116 )117 118 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:119 """120 Create the prompt for Claude to evaluate HTML quality.121 122 Args:123 user_goal: The user's data extraction goal124 html_content: The HTML content to evaluate125 126 Returns:127 Formatted prompt for Claude128 """129 return f"""You are a QA automation engineer specializing in web scraping. Your task is to evaluate the quality of an HTML document that was fetched by an automated browser. The goal is to determine if the HTML is suitable for data extraction or if it looks like a failure page (e.g., a CAPTCHA, block page, or empty loading page).130
131You will be given the user's original data extraction goal and the full HTML content from a scraping attempt.132
133REQUIREMENTS:134- Analysis: Analyze the provided HTML for common indicators of success or failure.135- Success Indicators: Meaningful content in the <body>, presence of <h1>-<h6> tags, lists (<ul>, <ol>), tables (<table>), structured <div>s with descriptive class names.136- Failure Indicators: Common keywords like "CAPTCHA", "bot detection", "enable JavaScript", "access denied", "rate limit". An empty or near-empty <body> tag. A structure that looks like a loading spinner.137- Output Format: Your response MUST be a single, well-formed JSON object with the following two keys:138 "score": An integer from 1 (unusable) to 10 (perfectly rendered and content-rich).139 "reasoning": A brief, one-sentence explanation for your score.140- Contextual Awareness: Use the "User's Goal" to inform your analysis. If the user wants "product listings" and you see a grid of items, that's a high score. If you see a login form, that's a low score.141
142[INPUT 1] User's Goal:143{user_goal}144
145[INPUT 2] HTML Content from an Actor Run:146{html_content}147
148Please analyze the HTML and provide your evaluation as a JSON object."""149
150 def _parse_claude_response(self, response_text: str) -> Optional[EvaluationResult]:151 """152 Parse Claude's response and extract the evaluation result.153 154 Args:155 response_text: Raw response from Claude156 157 Returns:158 EvaluationResult if parsing successful, None otherwise159 """160 try:161 # Try to find JSON in the response162 json_match = re.search(r'\{[^}]*"score"[^}]*\}', response_text, re.DOTALL)163 if not json_match:164 self.logger.error("No JSON object found in Claude's response")165 return None166 167 json_str = json_match.group(0)168 result_dict = json.loads(json_str)169 170 # Validate required fields171 if 'score' not in result_dict or 'reasoning' not in result_dict:172 self.logger.error("Missing required fields in Claude's response")173 return None174 175 score = int(result_dict['score'])176 if not (1 <= score <= 10):177 self.logger.error(f"Score {score} is outside valid range 1-10")178 return None179 180 return EvaluationResult(181 score=score,182 reasoning=result_dict['reasoning']183 )184 185 except (json.JSONDecodeError, ValueError, KeyError) as e:186 self.logger.error(f"Failed to parse Claude's response: {e}")187 return None188 189 def _preprocess_html(self, html_content: str) -> str:190 """191 Preprocess HTML content for analysis.192 193 Args:194 html_content: Raw HTML content195 196 Returns:197 Preprocessed HTML content198 """199 # Truncate very long HTML to avoid token limits200 max_length = 100000 # Adjust based on Claude's token limits201 if len(html_content) > max_length:202 self.logger.warning(f"HTML content truncated from {len(html_content)} to {max_length} characters")203 html_content = html_content[:max_length] + "\n<!-- Content truncated for analysis -->"204 205 return html_content206 207 def get_pre_evaluation_info(self, html_content: str) -> PreEvaluationResult:208 """209 Get detailed pre-evaluation information without performing the full evaluation.210 211 This method allows users to see what the pre-evaluation checks detected212 without calling Claude if they just want to understand the basic validation.213 214 Args:215 html_content: The HTML content to pre-evaluate216 217 Returns:218 PreEvaluationResult with detailed validation information219 """220 return self._pre_evaluate_html(html_content)221 222 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:223 """224 Evaluate the quality of HTML content for data extraction.225 226 This is the main function that analyzes HTML content to determine if it's 227 suitable for data extraction based on the user's goals.228 229 The evaluation process consists of two stages:230 1. Pre-evaluation: Basic validation checks for HTML structure, error pages, 231 and obvious content issues that don't require AI analysis232 2. Claude AI analysis: Detailed content evaluation using AI if pre-evaluation passes233 234 Args:235 user_goal: The user's data extraction goal/objective236 html_content: The HTML content to evaluate237 238 Returns:239 EvaluationResult containing score (1-10) and reasoning, or None if evaluation failed240 241 Example:242 >>> evaluator = HTMLQualityEvaluator("your-claude-api-key")243 >>> result = evaluator.evaluate_html_quality(244 ... "I want to get a list of all articles on the homepage",245 ... "<html><body><article>...</article></body></html>"246 ... )247 >>> if result:248 ... 
print(f"Score: {result.score}, Reasoning: {result.reasoning}")249 """250 if not user_goal or not user_goal.strip():251 raise ValueError("user_goal cannot be empty")252 253 # Perform pre-evaluation checks on the HTML content254 self.logger.info("Starting pre-evaluation checks on HTML content")255 pre_eval_result = self._pre_evaluate_html(html_content)256 257 if not pre_eval_result.should_continue_to_claude:258 self.logger.info(f"Pre-evaluation completed without Claude - Score: {pre_eval_result.score}")259 return EvaluationResult(260 score=pre_eval_result.score,261 reasoning=pre_eval_result.reasoning262 )263 264 # Log pre-evaluation results265 if not pre_eval_result.is_valid_html:266 self.logger.warning("Pre-evaluation detected non-HTML content, but continuing to Claude")267 else:268 self.logger.info("Pre-evaluation passed, proceeding to Claude analysis")269 270 try:271 # Preprocess HTML content272 processed_html = self._preprocess_html(html_content)273 274 # Create the evaluation prompt275 prompt = self._create_evaluation_prompt(user_goal, processed_html)276 277 self.logger.info("Sending HTML evaluation request to Claude")278 279 # Send request to Claude280 response = self.client.messages.create(281 model="claude-3-5-sonnet-20241022", # TODO use 4282 max_tokens=1000,283 messages=[284 {285 "role": "user",286 "content": prompt287 }288 ]289 )290 291 # Extract and parse the response292 response_text = response.content[0].text293 result = self._parse_claude_response(response_text)294 295 if result:296 self.logger.info(f"HTML evaluation completed - Score: {result.score}")297 else:298 self.logger.error("Failed to parse evaluation result from Claude")299 300 return result301 302 except anthropic.APIError as e:303 self.logger.error(f"Claude API error: {e}")304 return None305 except Exception as e:306 self.logger.error(f"Unexpected error during HTML evaluation: {e}", exc_info=True)307 return None
manual_scraping/apify_runner.py
1"""2Apify Actor Runner Module3
4This module provides functionality to run Apify actors and retrieve their results5using the official Apify Python client library.6"""7
8import logging9from typing import Optional, Dict, Any, List, Union10
11from apify_client import ApifyClient12
13
14def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:15 """16 Run an Apify actor and retrieve its dataset results.17 18 This function is maintained for backward compatibility. For more flexible19 data retrieval (including key-value stores), use run_apify_actor_with_flexible_retrieval.20 21 Args:22 actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')23 actor_input: Dictionary containing the input configuration for the actor24 api_token: Apify API token for authentication (keyword-only argument)25 26 Returns:27 List of dictionaries containing the dataset items if successful, None otherwise.28 Returns an empty list if the run succeeds but produces no data.29 30 Raises:31 ValueError: If actor_id or api_token is empty32 33 Example:34 >>> input_data = {"startUrls": [{"url": "https://example.com"}]}35 >>> results = run_apify_actor("apify/web-scraper", input_data, api_token="your_token")36 >>> if results is not None:37 ... print(f"Scraped {len(results)} items")38 """39 result = run_apify_actor_with_flexible_retrieval(40 actor_id=actor_id,41 actor_input=actor_input,42 api_token=api_token,43 retrieve_from="dataset"44 )45 46 # Ensure we return a list for backward compatibility47 if isinstance(result, list):48 return result49 else:50 return None51
52
53def run_apify_actor_with_flexible_retrieval(54 actor_id: str, 55 actor_input: dict, 56 *, 57 api_token: str,58 retrieve_from: str = "auto"59) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:60 """61 Run an Apify actor and retrieve its results from dataset or key-value store.62 63 This function starts an Apify actor with the provided input, waits for the run64 to complete, and returns data from either the dataset or key-value store based65 on the actor type or explicit configuration.66 67 Args:68 actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')69 actor_input: Dictionary containing the input configuration for the actor70 api_token: Apify API token for authentication (keyword-only argument)71 retrieve_from: Where to retrieve data from. Options:72 - "auto": Automatically detect based on actor_id73 - "dataset": Retrieve from dataset only74 - "key-value-store": Retrieve from key-value store only75 - "both": Retrieve from both and return combined results76 77 Returns:78 - If retrieve_from is "dataset": List of dictionaries from dataset79 - If retrieve_from is "key-value-store": Dictionary with key-value store items80 - If retrieve_from is "both": Dictionary with 'dataset' and 'key_value_store' keys81 - If retrieve_from is "auto": Appropriate format based on actor type82 Returns None if the run fails.83 84 Raises:85 ValueError: If actor_id, api_token is empty, or retrieve_from is invalid86 87 Example:88 >>> input_data = {"startUrls": [{"url": "https://example.com"}]}89 >>> # Auto-detect storage type90 >>> results = run_apify_actor_with_flexible_retrieval(91 ... "apify/website-content-crawler", 92 ... input_data, 93 ... api_token="your_token"94 ... )95 >>> # Explicitly use key-value store96 >>> results = run_apify_actor_with_flexible_retrieval(97 ... "apify/web-scraper", 98 ... input_data, 99 ... api_token="your_token",100 ... retrieve_from="key-value-store"101 ... )102 """103 # Input validation104 if not actor_id or not actor_id.strip():105 raise ValueError("actor_id cannot be empty")106 if not api_token or not api_token.strip():107 raise ValueError("api_token cannot be empty")108 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:109 raise ValueError("retrieve_from must be 'auto', 'dataset', 'key-value-store', or 'both'")110 111 # Configure logging112 logger = logging.getLogger(__name__)113 114 # Determine storage type based on actor if auto mode115 if retrieve_from == "auto":116 if "website-content-crawler" in actor_id:117 retrieve_from = "key-value-store"118 else:119 retrieve_from = "dataset"120 121 try:122 # Initialize the Apify client123 client = ApifyClient(api_token)124 125 logger.info(f"Starting Apify actor: {actor_id}")126 127 # Start the actor run128 run = client.actor(actor_id).call(run_input=actor_input)129 130 # Check if the run was created successfully131 if not run:132 logger.error(f"Failed to start actor run for {actor_id}")133 return None134 135 run_id = run.get('id')136 status = run.get('status')137 138 logger.info(f"Actor run started with ID: {run_id}, Status: {status}")139 140 # Check the final status of the run141 if status != 'SUCCEEDED':142 logger.warning(143 f"Actor run {run_id} did not succeed. "144 f"Final status: {status}. 
"145 f"Exit code: {run.get('exitCode', 'N/A')}"146 )147 return None148 149 logger.info(f"Actor run {run_id} completed successfully")150 151 # Retrieve data based on the specified method152 if retrieve_from == "dataset":153 dataset_client = client.dataset(run.get('defaultDatasetId'))154 dataset_items = list(dataset_client.iterate_items())155 logger.info(f"Retrieved {len(dataset_items)} items from dataset")156 return dataset_items157 158 elif retrieve_from == "key-value-store":159 kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))160 161 # List all keys in the key-value store162 keys_response = kv_store_client.list_keys()163 keys = [item['key'] for item in keys_response.get('items', [])]164 165 if not keys:166 logger.warning("No keys found in key-value store")167 return {}168 169 # Retrieve all key-value pairs170 kv_items = {}171 for key in keys:172 try:173 value = kv_store_client.get_record(key)174 if value:175 kv_items[key] = value.get('value')176 except Exception as e:177 logger.warning(f"Failed to retrieve key '{key}': {str(e)}")178 179 logger.info(f"Retrieved {len(kv_items)} items from key-value store")180 return kv_items181 182 elif retrieve_from == "both":183 # Retrieve from both sources184 results = {"dataset": [], "key_value_store": {}}185 186 # Get dataset items187 try:188 dataset_client = client.dataset(run.get('defaultDatasetId'))189 dataset_items = list(dataset_client.iterate_items())190 results["dataset"] = dataset_items191 logger.info(f"Retrieved {len(dataset_items)} items from dataset")192 except Exception as e:193 logger.warning(f"Failed to retrieve dataset items: {str(e)}")194 195 # Get key-value store items196 try:197 kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))198 keys_response = kv_store_client.list_keys()199 keys = [item['key'] for item in keys_response.get('items', [])]200 201 kv_items = {}202 for key in keys:203 try:204 value = kv_store_client.get_record(key)205 if value:206 kv_items[key] = value.get('value')207 except Exception as e:208 logger.warning(f"Failed to retrieve key '{key}': {str(e)}")209 210 results["key_value_store"] = kv_items211 logger.info(f"Retrieved {len(kv_items)} items from key-value store")212 except Exception as e:213 logger.warning(f"Failed to retrieve key-value store items: {str(e)}")214 215 return results216 217 except Exception as e:218 logger.error(f"Error running Apify actor {actor_id}: {str(e)}", exc_info=True)219 return None
manual_scraping/multi_actor_scraper.py
1"""2Multi-Actor Website Scraper3
4This module provides functionality to scrape websites using multiple Apify actors5simultaneously and return a dictionary of actor-scraped HTML pairs.6"""7
8import asyncio9import logging10from typing import Dict, List, Optional, Any11from concurrent.futures import ThreadPoolExecutor, as_completed12import json13
14from .apify_runner import run_apify_actor_with_flexible_retrieval, run_apify_actor15
16
17class MultiActorScraper:18 """19 A class to scrape websites using multiple Apify actors simultaneously.20 """21 22 def __init__(self, api_token: str):23 """24 Initialize the MultiActorScraper.25 26 Args:27 api_token: Apify API token for authentication28 """29 self.api_token = api_token30 self.logger = logging.getLogger(__name__)31 32 # Configure logging33 logging.basicConfig(34 level=logging.INFO,35 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'36 )37 38 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:39 """40 Get the configuration for each actor with the target URL.41 42 Args:43 target_url: The URL to scrape44 45 Returns:46 Dictionary mapping actor names to their configurations47 """48 return {49 "cheerio-scraper": {50 "actor_id": "apify/cheerio-scraper",51 "input": {52 "debugLog": False,53 "forceResponseEncoding": False,54 "ignoreSslErrors": False,55 "keepUrlFragments": False,56 "pageFunction": """async function pageFunction(context) {57 const { $, request, log } = context;58
59 const url = request.url;60 const html = $.html(); // Get the full HTML of the page61
62 log.info('Page scraped', { url });63
64 return {65 url,66 html67 };68}""",69 "postNavigationHooks": """// We need to return array of (possibly async) functions here.70// The functions accept a single argument: the "crawlingContext" object.71[72 async (crawlingContext) => {73 // ...74 },75]""",76 "preNavigationHooks": """// We need to return array of (possibly async) functions here.77// The functions accept two arguments: the "crawlingContext" object78// and "requestAsBrowserOptions" which are passed to the `requestAsBrowser()`79// function the crawler calls to navigate..80[81 async (crawlingContext, requestAsBrowserOptions) => {82 // ...83 }84]""",85 "proxyConfiguration": {86 "useApifyProxy": True87 },88 "respectRobotsTxtFile": False,89 "startUrls": [90 {91 "url": target_url,92 "method": "GET"93 }94 ]95 }96 },97 98 "website-content-crawler": {99 "actor_id": "apify/website-content-crawler",100 "input": {101 "aggressivePrune": False,102 "clickElementsCssSelector": "[aria-expanded=\"false\"]",103 "clientSideMinChangePercentage": 15,104 "crawlerType": "playwright:adaptive",105 "debugLog": False,106 "debugMode": False,107 "dynamicContentWaitSecs": 15,108 "expandIframes": True,109 "ignoreCanonicalUrl": False,110 "keepUrlFragments": False,111 "maxCrawlDepth": 0,112 "proxyConfiguration": {113 "useApifyProxy": True,114 "apifyProxyGroups": [115 "RESIDENTIAL"116 ]117 },118 "readableTextCharThreshold": 100,119 "removeCookieWarnings": True,120 "renderingTypeDetectionPercentage": 10,121 "respectRobotsTxtFile": False,122 "saveFiles": False,123 "saveHtml": False,124 "saveHtmlAsFile": True,125 "saveMarkdown": False,126 "saveScreenshots": False,127 "startUrls": [128 {129 "url": target_url,130 "method": "GET"131 }132 ],133 "useSitemaps": False134 }135 },136 137 "web-scraper": {138 "actor_id": "apify/web-scraper",139 "input": {140 "breakpointLocation": "NONE",141 "browserLog": False,142 "closeCookieModals": False,143 "debugLog": False,144 "downloadCss": True,145 "downloadMedia": True,146 "headless": True,147 "ignoreCorsAndCsp": False,148 "ignoreSslErrors": False,149 "injectJQuery": True,150 "keepUrlFragments": False,151 "pageFunction": """// The function accepts a single argument: the "context" object.152// For a complete list of its properties and functions,153// see https://apify.com/apify/web-scraper#page-function 154async function pageFunction(context) {155 // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!156 // debugger; 157
158 // jQuery is handy for finding DOM elements and extracting data from them.159 // To use it, make sure to enable the "Inject jQuery" option.160 const $ = context.jQuery;161 const url = context.request.url;162 const html = $('html').html(); // Get the full HTML of the page163
164 // Print some information to Actor log165 context.log.info(`URL: ${url}, HTML length: ${html ? html.length : 0}`);166
167 // Return an object with the data extracted from the page.168 // It will be stored to the resulting dataset.169 return {170 url,171 html172 };173}""",174 "postNavigationHooks": """// We need to return array of (possibly async) functions here.175// The functions accept a single argument: the "crawlingContext" object.176[177 async (crawlingContext) => {178 // ...179 },180]""",181 "preNavigationHooks": """// We need to return array of (possibly async) functions here.182// The functions accept two arguments: the "crawlingContext" object183// and "gotoOptions".184[185 async (crawlingContext, gotoOptions) => {186 // ...187 },188]""",189 "proxyConfiguration": {190 "useApifyProxy": True191 },192 "respectRobotsTxtFile": False,193 "runMode": "PRODUCTION",194 "startUrls": [195 {196 "url": target_url,197 "method": "GET"198 }199 ],200 "useChrome": False,201 "waitUntil": [202 "networkidle2"203 ]204 }205 }206 }207 208 def _run_single_actor(self, actor_name: str, actor_config: Dict[str, Any]) -> tuple[str, Optional[str]]:209 """210 Run a single actor and extract HTML content.211 212 Args:213 actor_name: Name of the actor214 actor_config: Configuration for the actor215 216 Returns:217 Tuple of (actor_name, html_content or None)218 """219 try:220 self.logger.info(f"Starting {actor_name}...")221 222 # Determine retrieval method based on actor type223 if "website-content-crawler" in actor_config["actor_id"]:224 retrieve_from = "key-value-store"225 else:226 retrieve_from = "dataset"227 228 results = run_apify_actor_with_flexible_retrieval(229 actor_config["actor_id"],230 actor_config["input"],231 api_token=self.api_token,232 retrieve_from=retrieve_from233 )234 235 if not results:236 self.logger.warning(f"{actor_name} returned no results")237 return actor_name, None238 239 # Extract HTML from results240 html_content = None241 242 # Handle different result formats243 if isinstance(results, list):244 # Dataset results (list format)245 for item in results:246 if isinstance(item, dict):247 # For cheerio-scraper and web-scraper248 if 'html' in item:249 html_content = item['html']250 break251 # For other actors that might use different keys252 elif 'text' in item:253 html_content = item['text']254 break255 elif 'content' in item:256 html_content = item['content']257 break258 # If it's a string (sometimes the whole result is HTML)259 elif isinstance(item, str):260 html_content = item261 break262 263 elif isinstance(results, dict):264 # Key-value store results (dict format) - for website-content-crawler265 # Look for HTML files in the key-value store266 for key, value in results.items():267 if key.endswith('.html') or 'html' in key.lower():268 if isinstance(value, str):269 html_content = value270 break271 elif isinstance(value, dict) and 'value' in value:272 html_content = value['value']273 break274 275 # If no HTML file found, look for other common keys276 if not html_content:277 for key, value in results.items():278 if isinstance(value, str) and len(value) > 100: # Likely HTML content279 html_content = value280 break281 elif isinstance(value, dict):282 # Check if the value dict contains HTML283 if 'html' in value:284 html_content = value['html']285 break286 elif 'content' in value:287 html_content = value['content']288 break289 290 if html_content:291 self.logger.info(f"{actor_name} completed successfully - HTML length: {len(html_content)}")292 else:293 self.logger.warning(f"{actor_name} completed but no HTML content found in results")294 # Log the structure of the first result for debugging295 if 
isinstance(results, list) and results:296 self.logger.debug(f"{actor_name} result structure: {list(results[0].keys()) if isinstance(results[0], dict) else type(results[0])}")297 elif isinstance(results, dict):298 self.logger.debug(f"{actor_name} key-value store keys: {list(results.keys())}")299 300 return actor_name, html_content301 302 except Exception as e:303 self.logger.error(f"Error running {actor_name}: {str(e)}", exc_info=True)304 return actor_name, None305 306 def scrape_with_multiple_actors(self, target_url: str, max_workers: int = 4) -> Dict[str, Optional[str]]:307 """308 Scrape a website using multiple Apify actors simultaneously.309 310 Args:311 target_url: The URL to scrape312 max_workers: Maximum number of concurrent workers313 314 Returns:315 Dictionary mapping actor names to their scraped HTML content316 """317 self.logger.info(f"Starting multi-actor scraping for URL: {target_url}")318 319 actor_configs = self._get_actor_configs(target_url)320 results = {}321 322 with ThreadPoolExecutor(max_workers=max_workers) as executor:323 # Submit all actor runs324 future_to_actor = {325 executor.submit(self._run_single_actor, actor_name, config): actor_name326 for actor_name, config in actor_configs.items()327 }328 329 # Collect results as they complete330 for future in as_completed(future_to_actor):331 actor_name = future_to_actor[future]332 try:333 actor_name_result, html_content = future.result()334 results[actor_name_result] = html_content335 336 except Exception as e:337 self.logger.error(f"Error getting result for {actor_name}: {str(e)}")338 results[actor_name] = None339 340 # Log summary341 successful_actors = [name for name, content in results.items() if content is not None]342 failed_actors = [name for name, content in results.items() if content is None]343 344 self.logger.info(f"Scraping completed!")345 self.logger.info(f"Successful actors: {successful_actors}")346 if failed_actors:347 self.logger.warning(f"Failed actors: {failed_actors}")348 349 return results350 351 def save_results_to_files(self, results: Dict[str, Optional[str]], output_dir: str = "scraped_results") -> None:352 """353 Save the scraped HTML results to separate files.354 355 Args:356 results: Dictionary of actor-HTML pairs357 output_dir: Directory to save the files358 """359 import os360 361 if not os.path.exists(output_dir):362 os.makedirs(output_dir)363 364 for actor_name, html_content in results.items():365 if html_content:366 filename = f"{actor_name.replace('/', '_')}_result.html"367 filepath = os.path.join(output_dir, filename)368 369 with open(filepath, 'w', encoding='utf-8') as f:370 f.write(html_content)371 372 self.logger.info(f"Saved {actor_name} result to {filepath}")373 else:374 self.logger.warning(f"No content to save for {actor_name}")375 376 def scrape_with_single_actor_flexible(target_url: str, actor_id: str, api_token: str, 377 custom_input: Optional[Dict[str, Any]] = None) -> Optional[str]:378 """379 Convenience function to scrape a website using a single Apify actor with flexible retrieval.380 381 This function automatically detects whether to use dataset or key-value store based on the actor type.382 383 Args:384 target_url: The URL to scrape385 actor_id: The Apify actor ID (e.g., 'apify/website-content-crawler')386 api_token: Apify API token387 custom_input: Optional custom input configuration for the actor388 389 Returns:390 HTML content as string if successful, None otherwise391 392 Example:393 >>> # For website-content-crawler (uses key-value store)394 >>> html = 
scrape_with_single_actor_flexible(395 ... "https://example.com", 396 ... "apify/website-content-crawler", 397 ... api_token398 ... )399 >>> # For web-scraper (uses dataset)400 >>> html = scrape_with_single_actor_flexible(401 ... "https://example.com", 402 ... "apify/web-scraper", 403 ... api_token404 ... )405 """406 logger = logging.getLogger(__name__)407 408 try:409 # Use default input if none provided410 if custom_input is None:411 if "website-content-crawler" in actor_id:412 actor_input = {413 "aggressivePrune": False,414 "clickElementsCssSelector": "[aria-expanded=\"false\"]",415 "clientSideMinChangePercentage": 15,416 "crawlerType": "playwright:adaptive",417 "debugLog": False,418 "debugMode": False,419 "dynamicContentWaitSecs": 15,420 "expandIframes": True,421 "ignoreCanonicalUrl": False,422 "keepUrlFragments": False,423 "proxyConfiguration": {424 "useApifyProxy": True,425 "apifyProxyGroups": ["RESIDENTIAL"]426 },427 "readableTextCharThreshold": 100,428 "removeCookieWarnings": True,429 "renderingTypeDetectionPercentage": 10,430 "respectRobotsTxtFile": False,431 "saveFiles": False,432 "saveHtml": False,433 "saveHtmlAsFile": True,434 "saveMarkdown": False,435 "saveScreenshots": False,436 "startUrls": [{"url": target_url, "method": "GET"}],437 "useSitemaps": False438 }439 else:440 # Default input for other actors441 actor_input = {442 "startUrls": [{"url": target_url, "method": "GET"}],443 "proxyConfiguration": {"useApifyProxy": True}444 }445 else:446 actor_input = custom_input447 448 # Determine retrieval method based on actor type449 if "website-content-crawler" in actor_id:450 retrieve_from = "key-value-store"451 else:452 retrieve_from = "dataset"453 454 # Run the actor with flexible retrieval455 results = run_apify_actor_with_flexible_retrieval(456 actor_id=actor_id,457 actor_input=actor_input,458 api_token=api_token,459 retrieve_from=retrieve_from460 )461 462 if not results:463 logger.warning(f"No results returned from {actor_id}")464 return None465 466 # Extract HTML content467 html_content = None468 469 if isinstance(results, list):470 # Dataset results471 for item in results:472 if isinstance(item, dict) and 'html' in item:473 html_content = item['html']474 break475 elif isinstance(results, dict):476 # Key-value store results477 for key, value in results.items():478 if key.endswith('.html') or 'html' in key.lower():479 if isinstance(value, str):480 html_content = value481 break482 elif isinstance(value, dict) and 'value' in value:483 html_content = value['value']484 break485 486 if html_content:487 logger.info(f"Successfully scraped {len(html_content)} characters from {target_url}")488 else:489 logger.warning(f"No HTML content found in results from {actor_id}")490 logger.debug(f"Result structure: {type(results)} with keys: {list(results.keys()) if isinstance(results, dict) else 'N/A'}")491 492 return html_content493 494 except Exception as e:495 logger.error(f"Error scraping with {actor_id}: {str(e)}", exc_info=True)496 return None497
498
499def scrape_website_with_multiple_actors(target_url: str, api_token: str) -> Dict[str, Optional[str]]:500 """501 Convenience function to scrape a website using multiple Apify actors.502 503 Args:504 target_url: The URL to scrape505 api_token: Apify API token506 507 Returns:508 Dictionary mapping actor names to their scraped HTML content509 """510 scraper = MultiActorScraper(api_token)511 return scraper.scrape_with_multiple_actors(target_url)512
513
514if __name__ == "__main__":515 # Example usage516 import os517 518 # Get API token from environment variable519 api_token = os.getenv("APIFY_TOKEN")520 if not api_token:521 print("Please set the APIFY_TOKEN environment variable")522 exit(1)523 524 # Example URL to scrape525 target_url = "https://fbref.com/en/players/3d1f29d9/matchlogs/2021-2022/Jordyn-Huitema-Match-Logs"526 527 # Create scraper and run528 scraper = MultiActorScraper(api_token)529 results = scraper.scrape_with_multiple_actors(target_url)530 531 # Print results summary532 print("\n=== SCRAPING RESULTS ===")533 for actor_name, html_content in results.items():534 if html_content:535 print(f"{actor_name}: SUCCESS (HTML length: {len(html_content)})")536 else:537 print(f"{actor_name}: FAILED")538 539 # Optionally save results to files540 scraper.save_results_to_files(results)
tests/examples.json
[ { "name": "Use Case 1", "page_url": "https://books.toscrape.com/", "goal": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock." }, { "name": "Use Case 2", "page_url": "https://www.theverge.com/", "goal": "I want to scrape the main articles from The Verge homepage. For each article, get me the headline, the author's name, and the link to the full article." }, { "name": "Use Case 3", "page_url": "https://en.wikipedia.org/wiki/Python_(programming_language)", "goal": "Get me information about the Python programming language from its Wikipedia page. I need the 'First appeared' date, the 'Stable release' version, and the official website from the infobox on the right." }, { "name": "Use Case 4", "page_url": "https://www.python.org/jobs/", "goal": "List all the jobs posted. For each job, I want the job title, the company name, the location, and the date it was posted." }, { "name": "Use Case 5", "page_url": "https://quotes.toscrape.com/", "goal": "I want a list of all quotes on this page. For each one, get the quote text itself, the name of the author, and a list of the tags associated with it." }]
src/llmscraper/__init__.py
1"""2ScraperCodeGenerator - Intelligent Web Scraping with AI3
4A smart web scraping framework that uses multiple scraping strategies5and AI-powered quality evaluation to extract data from websites.6"""7
8from .pipeline import IntelligentScraperPipeline, run_intelligent_scraper9from .models import ScrapingResult, GoalExtractionResult10from .main import main, extract_goal_from_prompt11from .scraping.actor_multi_scraper import ActorMultiScraper12
13__version__ = "0.1.0"14__all__ = [15 "IntelligentScraperPipeline",16 "run_intelligent_scraper",17 "ScrapingResult",18 "GoalExtractionResult",19 "main",20 "extract_goal_from_prompt",21 "ActorMultiScraper"22]
src/llmscraper/main.py
1"""2Main entry point and natural language processing for LLMScraper.3"""4
5import argparse6import logging7import re8import sys9from typing import Optional10
11import anthropic12
13from .models import ScrapingResult, GoalExtractionResult14from .pipeline import run_intelligent_scraper15from .utils import get_api_key, setup_logging16
17
18def extract_goal_from_prompt(prompt: str, claude_api_key: str) -> GoalExtractionResult:19 """20 Extract URL and scraping goal from a natural language prompt.21 22 Args:23 prompt: Natural language prompt containing URL and goal24 claude_api_key: Claude API key for processing25 26 Returns:27 GoalExtractionResult with extracted information28 """29 try:30 client = anthropic.Anthropic(api_key=claude_api_key)31 32 system_prompt = """You are an expert at parsing natural language requests for web scraping. 33Extract the URL and scraping goal from the user's prompt.34
35Return your answer in this EXACT JSON format:36{37 "url": "extracted_url_here",38 "goal": "clear_extraction_goal_here"39}40
41The goal should be specific and actionable for web scraping (e.g., "Get all product names and prices" rather than "scrape this site").42
43Only return the JSON, no other text."""44 45 response = client.messages.create(46 model="claude-3-5-sonnet-20241022",47 max_tokens=500,48 messages=[49 {50 "role": "user", 51 "content": f"Extract the URL and scraping goal from this prompt: {prompt}"52 }53 ],54 system=system_prompt55 )56 57 content = response.content[0].text58 59 # Extract JSON from response60 json_match = re.search(r'\{.*\}', content, re.DOTALL)61 if not json_match:62 return GoalExtractionResult(63 goal="",64 url="",65 success=False,66 error_message="Could not extract JSON from Claude response"67 )68 69 import json70 data = json.loads(json_match.group())71 72 url = data.get('url', '').strip()73 goal = data.get('goal', '').strip()74 75 if not url or not goal:76 return GoalExtractionResult(77 goal=goal,78 url=url,79 success=False,80 error_message="Missing URL or goal in extracted data"81 )82 83 return GoalExtractionResult(84 goal=goal,85 url=url,86 success=True87 )88 89 except Exception as e:90 return GoalExtractionResult(91 goal="",92 url="",93 success=False,94 error_message=f"Error extracting goal from prompt: {str(e)}"95 )96
97
98def main():99 """Main CLI entry point."""100 parser = argparse.ArgumentParser(101 description="LLMScraper - Intelligent Web Scraping with AI",102 formatter_class=argparse.RawDescriptionHelpFormatter,103 epilog="""104Examples:105 # Direct URL and goal106 llmscraper --url https://example.com --goal "Get all product names and prices"107 108 # Natural language prompt109 llmscraper --prompt "I want to scrape all book titles and prices from books.toscrape.com"110 111 # Actor mode (for Apify)112 llmscraper --url https://example.com --goal "Extract news headlines" --actor113 114 # Test the generated script115 llmscraper --url https://example.com --goal "Get product info" --test116 """117 )118 119 # Input options120 input_group = parser.add_mutually_exclusive_group(required=True)121 input_group.add_argument(122 '--prompt', 123 help='Natural language prompt with URL and extraction goal'124 )125 input_group.add_argument(126 '--url',127 help='Target URL to scrape (use with --goal)'128 )129 130 parser.add_argument(131 '--goal',132 help='Extraction goal (required when using --url)'133 )134 135 # API keys136 parser.add_argument(137 '--apify-token',138 help='Apify API token (or set APIFY_TOKEN env var)'139 )140 parser.add_argument(141 '--claude-key',142 help='Claude API key (or set CLAUDE_API_KEY env var)'143 )144 145 # Output options146 parser.add_argument(147 '--output', '-o',148 default='generated_scraper.py',149 help='Output file path for generated script (default: generated_scraper.py)'150 )151 parser.add_argument(152 '--actor',153 action='store_true',154 help='Generate script for Apify actor format'155 )156 157 # Execution options158 parser.add_argument(159 '--test',160 action='store_true',161 help='Test the generated script before saving'162 )163 parser.add_argument(164 '--no-prune-eval',165 action='store_true',166 help='Disable HTML pruning before quality evaluation'167 )168 169 # Logging170 parser.add_argument(171 '--verbose', '-v',172 action='store_true',173 help='Enable verbose logging'174 )175 parser.add_argument(176 '--quiet', '-q',177 action='store_true',178 help='Suppress non-error output'179 )180 181 args = parser.parse_args()182 183 # Setup logging184 if args.quiet:185 log_level = "ERROR"186 elif args.verbose:187 log_level = "DEBUG"188 else:189 log_level = "INFO"190 191 setup_logging(log_level)192 logger = logging.getLogger(__name__)193 194 # Validate arguments195 if args.url and not args.goal:196 logger.error("--goal is required when using --url")197 return 1198 199 # Get API keys200 apify_token = get_api_key("APIFY_TOKEN", args.apify_token)201 claude_key = get_api_key("CLAUDE_API_KEY", args.claude_key)202 203 if not apify_token:204 logger.error("APIFY_TOKEN is required (set environment variable or use --apify-token)")205 return 1206 207 if not claude_key:208 logger.error("CLAUDE_API_KEY is required (set environment variable or use --claude-key)")209 return 1210 211 # Process input212 if args.prompt:213 logger.info("🔍 Extracting URL and goal from natural language prompt...")214 extraction_result = extract_goal_from_prompt(args.prompt, claude_key)215 216 if not extraction_result.success:217 logger.error(f"Failed to extract goal from prompt: {extraction_result.error_message}")218 return 1219 220 target_url = extraction_result.url221 user_goal = extraction_result.goal222 223 logger.info(f"📍 Extracted URL: {target_url}")224 logger.info(f"🎯 Extracted Goal: {user_goal}")225 226 else: # args.url and args.goal227 target_url = args.url228 user_goal = args.goal229 230 # Run the pipeline231 
logger.info("🚀 Starting intelligent scraper pipeline...")232 233 result = run_intelligent_scraper(234 target_url=target_url,235 user_goal=user_goal,236 apify_token=apify_token,237 claude_api_key=claude_key,238 output_path=None if args.actor else args.output,239 prune_before_evaluation=not args.no_prune_eval,240 test_script=args.test,241 for_actor=args.actor242 )243 244 # Handle results245 if result.success:246 logger.info("✅ SUCCESS! Pipeline completed successfully!")247 logger.info(f"🏆 Best actor: {result.best_actor}")248 249 if not args.actor:250 logger.info(f"📁 Generated script: {args.output}")251 252 if result.quality_scores:253 logger.info("📊 Quality scores:")254 for actor, score in result.quality_scores.items():255 emoji = "🟢" if score >= 7 else "🟡" if score >= 4 else "🔴"256 logger.info(f" {emoji} {actor}: {score}/10")257 258 if args.test and result.extracted_data:259 data_count = len(result.extracted_data) if isinstance(result.extracted_data, list) else 1260 logger.info(f"🧪 Test extraction successful: {data_count} items")261 262 if args.actor:263 print("\\n" + "="*60)264 print("📄 GENERATED ACTOR SCRIPT")265 print("="*60)266 print(result.generated_script)267 else:268 logger.info("\\n🚀 Next steps:")269 logger.info(f" 1. Review the generated script: {args.output}")270 logger.info(f" 2. Run the scraper: python {args.output}")271 logger.info(" 3. Check the extracted JSON data")272 273 return 0274 275 else:276 logger.error(f"❌ FAILED: {result.error_message}")277 if result.quality_scores:278 logger.info("📊 Quality scores achieved:")279 for actor, score in result.quality_scores.items():280 emoji = "🟢" if score >= 7 else "🟡" if score >= 4 else "🔴"281 logger.info(f" {emoji} {actor}: {score}/10")282 return 1283
284
285if __name__ == "__main__":286 sys.exit(main())
src/llmscraper/models.py
1"""2Data models for the ScraperCodeGenerator pipeline.3"""4
5from dataclasses import dataclass6from typing import Dict, Any, Optional, List7
8
9@dataclass10class ScrapingResult:11 """Result of the complete scraping pipeline."""12 success: bool13 generated_script: Optional[str] = None14 best_actor: Optional[str] = None15 schema: Optional[Dict[str, Any]] = None16 error_message: Optional[str] = None17 quality_scores: Optional[Dict[str, int]] = None18 extracted_data: Optional[List[Dict[str, Any]]] = None19
20
21@dataclass22class EvaluationResult:23 """Result of HTML quality evaluation."""24 score: int # 1-10 scale25 reasoning: str26
27
28@dataclass29class PreEvaluationResult:30 """Result of pre-evaluation checks before sending to Claude."""31 is_valid_html: bool32 score: Optional[int] = None # If we can determine score without Claude33 reasoning: Optional[str] = None34 should_continue_to_claude: bool = True35
36
37@dataclass38class GoalExtractionResult:39 """Result of extracting goal from natural language prompt."""40 goal: str41 url: str42 success: bool43 error_message: Optional[str] = None
src/llmscraper/pipeline.py
1"""2Main pipeline for intelligent web scraping.3"""4
5import logging6from typing import Optional7
8from .models import ScrapingResult9from .scraping import MultiActorScraper10from .scraping.actor_multi_scraper import ActorMultiScraper11from .evaluation import HTMLQualityEvaluator12from .generation import ScriptGenerator, ScriptExecutor13from .utils import prune_html, validate_required_keys, get_api_key14
15
16class IntelligentScraperPipeline:17 """Main pipeline class that orchestrates the intelligent web scraping process."""18 19 def __init__(self, apify_token: str, claude_api_key: str, actor_logger=None):20 """21 Initialize the pipeline with required API tokens.22 23 Args:24 apify_token: Apify API token for web scraping25 claude_api_key: Anthropic Claude API key for AI analysis26 actor_logger: Optional Actor logger for actor mode27 """28 # Validate API keys29 validated_keys = validate_required_keys(30 apify_token=apify_token,31 claude_api_key=claude_api_key32 )33 34 self.apify_token = validated_keys['apify_token']35 self.claude_api_key = validated_keys['claude_api_key']36 37 # Initialize components38 self.multi_scraper = MultiActorScraper(self.apify_token)39 self.actor_scraper = ActorMultiScraper() # For actor-to-actor communication40 self.quality_evaluator = HTMLQualityEvaluator(self.claude_api_key)41 self.script_generator = ScriptGenerator(self.claude_api_key)42 self.script_executor = ScriptExecutor()43 44 # Setup logging - use Actor logger if provided, otherwise standard logging45 self.logger = actor_logger if actor_logger else logging.getLogger(__name__)46 self.is_actor_mode = actor_logger is not None47 48 async def run_complete_pipeline(self, target_url: str, user_goal: str, 49 output_script_path: Optional[str] = None,50 prune_before_evaluation: bool = True,51 test_script: bool = False,52 for_actor: bool = False) -> ScrapingResult:53 """54 Run the complete intelligent scraping pipeline.55 56 Args:57 target_url: The URL to scrape58 user_goal: Natural language description of what to extract59 output_script_path: Path where to save the generated script (None for actor mode)60 prune_before_evaluation: If True, prune HTML before quality evaluation61 test_script: If True, test the generated script before finalizing62 for_actor: If True, generate script for Apify actor format63 64 Returns:65 ScrapingResult containing the outcome and generated artifacts66 """67 self.logger.info(f"PIPELINE: Starting intelligent scraping pipeline for: {target_url}")68 self.logger.info(f"PIPELINE: User goal: {user_goal}")69 self.logger.info(f"PIPELINE: Actor mode: {for_actor}")70 71 try:72 # Step 1: Run multiple actors to scrape the website73 self.logger.info("PIPELINE: Step 1: Running multi-actor scraping...")74 75 # Use actor-aware scraper if running inside an Apify actor76 if for_actor:77 self.logger.info("PIPELINE: Using actor-to-actor communication...")78 scraping_results = await self.actor_scraper.scrape_with_multiple_actors(target_url)79 else:80 self.logger.info("PIPELINE: Using client-based scraping...")81 scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url)82 83 if not any(content for content in scraping_results.values() if content):84 return ScrapingResult(85 success=False,86 error_message="All scraping actors failed to retrieve content"87 )88 89 # Step 2: Evaluate quality of each result90 self.logger.info("PIPELINE: Step 2: Evaluating HTML quality for each actor...")91 quality_scores, best_actor, best_html = self._evaluate_html_quality(92 scraping_results, user_goal, prune_before_evaluation93 )94 95 if not best_html:96 return ScrapingResult(97 success=False,98 error_message="No actor produced quality HTML content",99 quality_scores=quality_scores100 )101 102 self.logger.info(f"PIPELINE: Best actor selected: {best_actor} with score {quality_scores[best_actor]}/10")103 104 # Step 3: Prune the best HTML to reduce token count105 self.logger.info("PIPELINE: Step 3: Pruning HTML 
content...")106 pruned_html = prune_html(best_html, max_list_items=4, max_text_length=500)107 108 original_length = len(best_html)109 pruned_length = len(pruned_html)110 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0111 112 self.logger.info(f"PIPELINE: HTML pruned: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")113 114 # Step 4: Generate Python scraping script115 self.logger.info("PIPELINE: Step 4: Generating Python scraping script...")116 generated_script = self.script_generator.generate_scraping_script(117 target_url, best_actor, pruned_html, user_goal, for_actor118 )119 120 if not generated_script:121 return ScrapingResult(122 success=False,123 error_message="Failed to generate scraping script",124 best_actor=best_actor,125 quality_scores=quality_scores126 )127 128 # Step 5: Test the script if requested129 extracted_data = None130 if test_script:131 self.logger.info("PIPELINE: Step 5: Testing generated script...")132 test_result = self.script_executor.test_script(generated_script, best_html)133 134 if test_result["success"]:135 self.logger.info(f"PIPELINE: ✅ Script test passed! Extracted {test_result.get('item_count', 0)} items")136 extracted_data = test_result["data"]137 else:138 self.logger.warning(f"PIPELINE: ⚠️ Script test failed: {test_result['error']}")139 # Continue anyway, but log the issue140 141 # Step 6: Save the generated script (only if not actor mode)142 if output_script_path and not for_actor:143 self.logger.info(f"PIPELINE: Step 6: Saving generated script to {output_script_path}")144 with open(output_script_path, 'w', encoding='utf-8') as f:145 f.write(generated_script)146 147 self.logger.info("PIPELINE: ✅ Pipeline completed successfully!")148 149 return ScrapingResult(150 success=True,151 generated_script=generated_script,152 best_actor=best_actor,153 quality_scores=quality_scores,154 extracted_data=extracted_data155 )156 157 except Exception as e:158 self.logger.error(f"PIPELINE: Pipeline failed with error: {str(e)}")159 return ScrapingResult(160 success=False,161 error_message=f"Pipeline error: {str(e)}"162 )163 164 def _evaluate_html_quality(self, scraping_results: dict, user_goal: str, 165 prune_before_evaluation: bool) -> tuple[dict, str, str]:166 """Evaluate HTML quality for each scraping result."""167 quality_scores = {}168 best_actor = None169 best_html = None170 best_score = 0171 172 for actor_name, html_content in scraping_results.items():173 if html_content:174 self.logger.info(f"PIPELINE: Evaluating {actor_name}...")175 176 # Optionally prune HTML before evaluation177 evaluation_html = html_content178 if prune_before_evaluation:179 original_length = len(html_content)180 evaluation_html = prune_html(html_content, max_list_items=3, max_text_length=100)181 pruned_length = len(evaluation_html)182 reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0183 self.logger.info(f"PIPELINE: {actor_name} HTML pruned for evaluation: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")184 185 evaluation = self.quality_evaluator.evaluate_html_quality(user_goal, evaluation_html)186 187 if evaluation:188 quality_scores[actor_name] = evaluation.score189 self.logger.info(f"PIPELINE: {actor_name} quality score: {evaluation.score}/10 - {evaluation.reasoning}")190 191 if evaluation.score > best_score:192 best_score = evaluation.score193 best_actor = actor_name194 best_html = html_content # Keep original HTML, not pruned 
version195 else:196 quality_scores[actor_name] = 0197 self.logger.warning(f"PIPELINE: Failed to evaluate {actor_name}")198 else:199 quality_scores[actor_name] = 0200 self.logger.warning(f"PIPELINE: {actor_name} returned no content")201 202 return quality_scores, best_actor, best_html203
204
205async def run_intelligent_scraper(target_url: str, user_goal: str, 206 apify_token: Optional[str] = None,207 claude_api_key: Optional[str] = None,208 output_path: Optional[str] = "generated_scraper.py",209 prune_before_evaluation: bool = True,210 test_script: bool = False,211 for_actor: bool = False,212 actor_logger=None) -> ScrapingResult:213 """214 Convenience function to run the complete intelligent scraping pipeline.215 216 Args:217 target_url: URL to scrape218 user_goal: Natural language description of extraction goal219 apify_token: Apify API token (uses APIFY_TOKEN env var if not provided)220 claude_api_key: Claude API key (uses CLAUDE_API_KEY env var if not provided)221 output_path: Path to save the generated script (None for actor mode)222 prune_before_evaluation: If True, prune HTML before quality evaluation223 test_script: If True, test the generated script before finalizing224 for_actor: If True, generate script for Apify actor format225 actor_logger: Optional Actor logger for actor mode226 227 Returns:228 ScrapingResult with the outcome229 """230 # Get tokens from environment if not provided231 if not apify_token:232 apify_token = get_api_key("APIFY_TOKEN")233 if not claude_api_key:234 claude_api_key = get_api_key("CLAUDE_API_KEY")235 236 if not apify_token:237 return ScrapingResult(238 success=False,239 error_message="APIFY_TOKEN not provided and not found in environment variables"240 )241 242 if not claude_api_key:243 return ScrapingResult(244 success=False,245 error_message="CLAUDE_API_KEY not provided and not found in environment variables"246 )247 248 # Create and run pipeline249 pipeline = IntelligentScraperPipeline(apify_token, claude_api_key, actor_logger)250 return await pipeline.run_complete_pipeline(251 target_url, user_goal, output_path, prune_before_evaluation, test_script, for_actor252 )
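For local runs outside the Apify platform, the convenience function above can be awaited directly. The following is a minimal sketch, assuming the package exposes run_intelligent_scraper as a top-level import and that APIFY_TOKEN and CLAUDE_API_KEY are set in the environment; the URL and goal are placeholders.

import asyncio

from llmscraper import run_intelligent_scraper  # assumed top-level export


async def demo():
    # Placeholder target and goal; credentials are read from the environment
    # (APIFY_TOKEN / CLAUDE_API_KEY) because none are passed explicitly.
    result = await run_intelligent_scraper(
        target_url="https://example.com/products",
        user_goal="Extract product names and prices",
        output_path="generated_scraper.py",  # standalone-mode script output
        test_script=True,                    # test the generated script against the scraped HTML
    )

    if result.success:
        print(f"Best actor: {result.best_actor}")
        print(f"Quality scores: {result.quality_scores}")
        print(f"Generated script: {len(result.generated_script or '')} chars")
    else:
        print(f"Pipeline failed: {result.error_message}")


if __name__ == "__main__":
    asyncio.run(demo())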
src/llmscraper/evaluation/__init__.py
1"""2Evaluation module for ScraperCodeGenerator.3"""4
5from .html_quality_evaluator import HTMLQualityEvaluator6
7__all__ = ["HTMLQualityEvaluator"]
src/llmscraper/evaluation/html_quality_evaluator.py
1"""2HTML quality evaluation using Claude AI.3"""4
5import json6import logging7import re8from typing import Optional9
10import anthropic11
12from ..models import EvaluationResult, PreEvaluationResult13
14
15class HTMLQualityEvaluator:16 """Evaluates HTML quality for web scraping using Claude AI."""17 18 def __init__(self, claude_api_key: str):19 """Initialize with Claude API key."""20 if not claude_api_key or not claude_api_key.strip():21 raise ValueError("Claude API key cannot be empty")22 23 self.client = anthropic.Anthropic(api_key=claude_api_key)24 self.logger = logging.getLogger(__name__)25 26 def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:27 """28 Evaluate HTML quality for data extraction.29 30 Args:31 user_goal: User's extraction goal32 html_content: HTML content to evaluate33 34 Returns:35 EvaluationResult or None if evaluation fails36 """37 try:38 # Pre-evaluation checks39 pre_eval = self._pre_evaluate_html(html_content)40 if not pre_eval.should_continue_to_claude:41 if pre_eval.score is not None:42 return EvaluationResult(score=pre_eval.score, reasoning=pre_eval.reasoning)43 return None44 45 # Claude evaluation46 return self._evaluate_with_claude(user_goal, html_content)47 48 except Exception as e:49 self.logger.error(f"Error evaluating HTML quality: {str(e)}")50 return None51 52 def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:53 """Perform basic HTML validation checks."""54 if not html_content or not html_content.strip():55 return PreEvaluationResult(56 is_valid_html=False,57 score=1,58 reasoning="Empty or whitespace-only HTML content",59 should_continue_to_claude=False60 )61 62 # Check for common failure indicators63 content_lower = html_content.lower()64 65 # Bot detection/blocking indicators66 blocking_indicators = [67 'please verify you are a human',68 'access denied',69 'blocked',70 'captcha',71 'cloudflare',72 'ddos protection',73 'security check',74 'bot detected'75 ]76 77 for indicator in blocking_indicators:78 if indicator in content_lower:79 return PreEvaluationResult(80 is_valid_html=False,81 score=1,82 reasoning=f"HTML appears to be blocked/bot-detected (found: '{indicator}')",83 should_continue_to_claude=False84 )85 86 # Check for minimal HTML structure87 if not re.search(r'<html|<body|<div|<p|<span', content_lower):88 return PreEvaluationResult(89 is_valid_html=False,90 score=2,91 reasoning="HTML lacks basic structural elements",92 should_continue_to_claude=False93 )94 95 return PreEvaluationResult(96 is_valid_html=True,97 should_continue_to_claude=True98 )99 100 def _evaluate_with_claude(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:101 """Evaluate HTML using Claude AI."""102 try:103 prompt = self._create_evaluation_prompt(user_goal, html_content)104 105 response = self.client.messages.create(106 model="claude-3-5-sonnet-20241022",107 max_tokens=500,108 messages=[{"role": "user", "content": prompt}]109 )110 111 content = response.content[0].text112 return self._parse_evaluation_response(content)113 114 except Exception as e:115 self.logger.error(f"Error in Claude evaluation: {str(e)}")116 return None117 118 def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:119 """Create the evaluation prompt for Claude."""120 return f"""You are an expert web scraper evaluator. Analyze the provided HTML and determine how suitable it is for extracting the requested data.121
122USER EXTRACTION GOAL:123{user_goal}124
125HTML CONTENT TO EVALUATE:126{html_content}127
128Evaluate the HTML on a scale of 1-10 based on:1291. Presence of the target data elements1302. HTML structure quality and accessibility1313. Whether the page loaded correctly (not blocked, error page, etc.)1324. How easy it would be to extract the requested data133
134Return your evaluation in this EXACT JSON format:135{{136 "score": [1-10 integer],137 "reasoning": "[brief explanation of the score]"138}}139
140Only return the JSON, no other text.141"""142 143 def _parse_evaluation_response(self, response: str) -> Optional[EvaluationResult]:144 """Parse Claude's evaluation response."""145 try:146 # Extract JSON from response147 json_match = re.search(r'\{.*\}', response, re.DOTALL)148 if not json_match:149 raise ValueError("No JSON found in response")150 151 data = json.loads(json_match.group())152 153 score = data.get('score')154 reasoning = data.get('reasoning', '')155 156 if not isinstance(score, int) or score < 1 or score > 10:157 raise ValueError(f"Invalid score: {score}")158 159 return EvaluationResult(score=score, reasoning=reasoning)160 161 except Exception as e:162 self.logger.error(f"Error parsing evaluation response: {str(e)}")163 return None
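The pre-evaluation step above means obviously blocked or empty pages are scored without spending a Claude call. A minimal sketch, assuming the package is importable and using a placeholder API key; for the blocked page below the evaluator returns before any request is made.

from llmscraper.evaluation import HTMLQualityEvaluator

# Placeholder key: constructing the Anthropic client does not send a request,
# and the blocked-page path below short-circuits before any Claude call.
evaluator = HTMLQualityEvaluator(claude_api_key="sk-ant-placeholder")

blocked_page = "<html><body><h1>Access denied</h1><p>Security check required.</p></body></html>"
result = evaluator.evaluate_html_quality(
    user_goal="Extract article headlines and publication dates",
    html_content=blocked_page,
)

if result:
    print(result.score)      # 1 (pre-evaluation short-circuit)
    print(result.reasoning)  # names the blocking indicator that was matched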
src/llmscraper/generation/__init__.py
1"""2Generation module for ScraperCodeGenerator.3"""4
5from .script_generator import ScriptGenerator6from .script_executor import ScriptExecutor7
8__all__ = ["ScriptGenerator", "ScriptExecutor"]
src/llmscraper/generation/script_executor.py
1"""2Script execution and testing functionality.3"""4
5import subprocess6import tempfile7import os8import json9import logging10from typing import Dict, Any, Optional11import ast12import traceback13
14
15class ScriptExecutor:16 """Executes and tests generated scraping scripts."""17 18 def __init__(self):19 """Initialize the script executor."""20 self.logger = logging.getLogger(__name__)21 22 def test_script(self, script_content: str, html_content: str) -> Dict[str, Any]:23 """24 Test a scraping script against sample HTML content.25 26 Args:27 script_content: The Python script to test28 html_content: Sample HTML to test against29 30 Returns:31 Dict with test results including success, data, and errors32 """33 try:34 # Extract the extract_data function from the script35 extract_function = self._extract_function_from_script(script_content, 'extract_data')36 37 if not extract_function:38 return {39 "success": False,40 "error": "Could not find extract_data function in script",41 "data": None42 }43 44 # Create a safe execution environment45 safe_globals = {46 '__builtins__': {47 'len': len,48 'str': str,49 'int': int,50 'float': float,51 'bool': bool,52 'list': list,53 'dict': dict,54 'range': range,55 'enumerate': enumerate,56 'zip': zip,57 'isinstance': isinstance,58 'hasattr': hasattr,59 'getattr': getattr,60 'print': print,61 }62 }63 64 # Import necessary modules into the environment65 exec("from bs4 import BeautifulSoup", safe_globals)66 exec("import re", safe_globals)67 exec("import json", safe_globals)68 69 # Execute the function definition70 exec(extract_function, safe_globals)71 72 # Call the function with the HTML content73 extracted_data = safe_globals['extract_data'](html_content)74 75 return {76 "success": True,77 "data": extracted_data,78 "error": None,79 "data_type": type(extracted_data).__name__,80 "item_count": len(extracted_data) if isinstance(extracted_data, (list, dict)) else 181 }82 83 except Exception as e:84 self.logger.error(f"Error testing script: {str(e)}")85 return {86 "success": False,87 "error": str(e),88 "data": None,89 "traceback": traceback.format_exc()90 }91 92 def _extract_function_from_script(self, script_content: str, function_name: str) -> Optional[str]:93 """Extract a specific function from a script."""94 try:95 # Parse the script into an AST96 tree = ast.parse(script_content)97 98 # Find the function definition99 for node in ast.walk(tree):100 if isinstance(node, ast.FunctionDef) and node.name == function_name:101 # Get the source code of the function102 lines = script_content.split('\n')103 start_line = node.lineno - 1104 105 # Find the end of the function106 end_line = start_line + 1107 while end_line < len(lines):108 line = lines[end_line]109 # Check if this line starts a new function or class110 if line.strip() and not line.startswith(' ') and not line.startswith('\t'):111 break112 end_line += 1113 114 return '\n'.join(lines[start_line:end_line])115 116 return None117 118 except Exception as e:119 self.logger.error(f"Error extracting function: {str(e)}")120 return None121 122 def validate_script_syntax(self, script_content: str) -> Dict[str, Any]:123 """124 Validate the syntax of a Python script.125 126 Args:127 script_content: The Python script to validate128 129 Returns:130 Dict with validation results131 """132 try:133 # Try to parse the script134 ast.parse(script_content)135 136 return {137 "valid": True,138 "error": None139 }140 141 except SyntaxError as e:142 return {143 "valid": False,144 "error": f"Syntax error: {str(e)}",145 "line": e.lineno,146 "offset": e.offset147 }148 except Exception as e:149 return {150 "valid": False,151 "error": f"Parse error: {str(e)}"152 }153 154 def run_script_in_sandbox(self, script_content: str, timeout: int = 
60) -> Dict[str, Any]:155 """156 Run a complete script in a sandboxed environment.157 158 Args:159 script_content: The complete Python script160 timeout: Maximum execution time in seconds161 162 Returns:163 Dict with execution results164 """165 try:166 # Create a temporary file167 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:168 temp_file.write(script_content)169 temp_file_path = temp_file.name170 171 try:172 # Run the script173 result = subprocess.run(174 ['python', temp_file_path],175 capture_output=True,176 text=True,177 timeout=timeout,178 cwd=os.path.dirname(temp_file_path)179 )180 181 return {182 "success": result.returncode == 0,183 "stdout": result.stdout,184 "stderr": result.stderr,185 "return_code": result.returncode186 }187 188 finally:189 # Clean up the temporary file190 os.unlink(temp_file_path)191 192 except subprocess.TimeoutExpired:193 return {194 "success": False,195 "stdout": "",196 "stderr": f"Script execution timed out after {timeout} seconds",197 "return_code": -1198 }199 except Exception as e:200 return {201 "success": False,202 "stdout": "",203 "stderr": str(e),204 "return_code": -1205 }
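As a quick offline check of the executor, validate_script_syntax only needs the standard-library ast parser, so it can be exercised without API keys or sample HTML. A small sketch; both scripts below are hypothetical.

from llmscraper.generation import ScriptExecutor

executor = ScriptExecutor()

good_script = """
def extract_data(html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    return [{"title": h.get_text(strip=True)} for h in soup.select("h2")]
"""

bad_script = "def extract_data(html_content)\n    return []"  # missing colon

print(executor.validate_script_syntax(good_script))  # {'valid': True, 'error': None}
print(executor.validate_script_syntax(bad_script))   # {'valid': False, 'error': 'Syntax error: ...', 'line': 1, ...}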
src/llmscraper/generation/script_generator.py
1"""2Code generation functionality for creating scraping scripts.3"""4
5import logging6from typing import Optional7import re8
9import anthropic10
11
12class ScriptGenerator:13 """Generates Python scraping scripts using Claude AI."""14 15 def __init__(self, claude_api_key: str):16 """Initialize with Claude API key."""17 if not claude_api_key or not claude_api_key.strip():18 raise ValueError("Claude API key cannot be empty")19 20 self.client = anthropic.Anthropic(api_key=claude_api_key)21 self.logger = logging.getLogger(__name__)22 23 def generate_scraping_script(self, target_url: str, best_actor: str, 24 pruned_html: str, user_goal: str, 25 for_actor: bool = False) -> Optional[str]:26 """27 Generate a complete Python scraping script.28 29 Args:30 target_url: The target URL to scrape31 best_actor: Name of the best performing actor32 pruned_html: Sample HTML content for reference33 user_goal: User's extraction goal34 for_actor: If True, generate for Apify actor (key-value store output)35 36 Returns:37 Complete Python script as string, or None if generation fails38 """39 try:40 # Generate the HTML parsing code from Claude41 parsing_code = self._generate_html_parsing_code(pruned_html, user_goal)42 43 if not parsing_code:44 self.logger.error("Failed to generate HTML parsing code")45 return None46 47 # Create the complete script48 if for_actor:49 return self._create_actor_script(target_url, best_actor, parsing_code, user_goal)50 else:51 return self._create_standalone_script(target_url, best_actor, parsing_code, user_goal)52 53 except Exception as e:54 self.logger.error(f"Error generating scraping script: {str(e)}")55 return None56 57 def _generate_html_parsing_code(self, pruned_html: str, user_goal: str) -> Optional[str]:58 """Generate HTML parsing/extraction code using Claude."""59 try:60 prompt = f"""Generate Python code that parses HTML content and extracts data based on the user's goal. You should generate ONLY the extraction logic, not a complete script.61
62## USER GOAL:63{user_goal}64
65## SAMPLE HTML (for reference):66{pruned_html}67
68## REQUIREMENTS:691. Create a function called `extract_data(html_content)` that takes HTML string as input702. Use BeautifulSoup to parse the HTML713. Extract the data according to the user's goal using CSS selectors, attributes, text content, etc.724. Return the extracted data as a Python dictionary or list of dictionaries735. Handle missing or malformed data gracefully746. Include appropriate error handling75
76## EXAMPLE OUTPUT FORMAT:77```python78def extract_data(html_content):79 from bs4 import BeautifulSoup80 81 soup = BeautifulSoup(html_content, 'html.parser')82 results = []83 84 # Your extraction logic here85 # Use soup.find(), soup.find_all(), CSS selectors, etc.86 87 return results88```89
90Generate ONLY the `extract_data` function and any helper functions needed. Do not include imports outside the function, full scripts, or other boilerplate code."""91
92 self.logger.info("Requesting HTML parsing code generation from Claude...")93 94 response = self.client.messages.create(95 model="claude-3-5-sonnet-20241022",96 max_tokens=2000,97 messages=[{"role": "user", "content": prompt}]98 )99 100 parsing_code = response.content[0].text101 102 # Extract Python code from response if wrapped in code blocks103 if "```python" in parsing_code:104 code_match = re.search(r'```python\n(.*?)\n```', parsing_code, re.DOTALL)105 if code_match:106 parsing_code = code_match.group(1)107 108 return parsing_code109 110 except Exception as e:111 self.logger.error(f"Error generating HTML parsing code: {str(e)}")112 return None113 114 def _create_standalone_script(self, target_url: str, best_actor: str, 115 parsing_code: str, user_goal: str) -> str:116 """Create a standalone Python script."""117 return f'''#!/usr/bin/env python3118"""119Generated Web Scraper120Target: {target_url}121Goal: {user_goal}122Best Actor: {best_actor}123Generated by: ScraperCodeGenerator124"""125
126import os127import json128import logging129from typing import Dict, Any, List, Optional130from bs4 import BeautifulSoup131
132from llmscraper.scraping import MultiActorScraper133
134
135{parsing_code}136
137
138def main():139 """Main function to run the scraper."""140 # Setup logging141 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')142 logger = logging.getLogger(__name__)143 144 # Configuration145 target_url = "{target_url}"146 apify_token = os.getenv("APIFY_TOKEN")147 148 if not apify_token:149 logger.error("APIFY_TOKEN environment variable not set")150 logger.info("Please set your Apify API token: export APIFY_TOKEN='your_token_here'")151 return152 153 try:154 logger.info(f"🚀 Starting scraper for: {{target_url}}")155 logger.info(f"📝 Goal: {user_goal}")156 logger.info(f"🏆 Using best actor: {best_actor}")157 158 # Initialize the multi-actor scraper159 scraper = MultiActorScraper(apify_token)160 161 # Get the actor configuration for the best performing actor162 actor_configs = scraper._get_actor_configs(target_url)163 164 if "{best_actor}" not in actor_configs:165 logger.error(f"Best actor '{best_actor}' not found in available actors")166 return167 168 # Run the best actor to get HTML169 logger.info(f"🔄 Running {{'{best_actor}'}} actor...")170 actor_name, html_content = scraper._run_single_actor("{best_actor}", actor_configs["{best_actor}"])171 172 if not html_content:173 logger.error(f"Failed to get HTML content from {{'{best_actor}'}} actor")174 return175 176 logger.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")177 178 # Extract data using the generated parsing code179 logger.info("🔍 Extracting data from HTML...")180 extracted_data = extract_data(html_content)181 182 if not extracted_data:183 logger.warning("No data was extracted from the HTML")184 return185 186 # Prepare final results187 results = {{188 "target_url": target_url,189 "extraction_goal": "{user_goal}",190 "actor_used": "{best_actor}",191 "data": extracted_data,192 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1193 }}194 195 # Output results196 print("\\n" + "="*60)197 print("📊 EXTRACTION RESULTS")198 print("="*60)199 print(json.dumps(results, indent=2, ensure_ascii=False))200 201 # Save to file202 output_file = "extracted_data.json"203 with open(output_file, 'w', encoding='utf-8') as f:204 json.dump(results, f, indent=2, ensure_ascii=False)205 206 logger.info(f"💾 Results saved to {{output_file}}")207 logger.info(f"🎉 Successfully extracted {{results['total_items']}} items!")208 209 except Exception as e:210 logger.error(f"❌ Scraping failed: {{e}}")211 import traceback212 traceback.print_exc()213
214
215if __name__ == "__main__":216 main()217'''218 219 def _create_actor_script(self, target_url: str, best_actor: str, 220 parsing_code: str, user_goal: str) -> str:221 """Create a script for Apify actor."""222 return f'''"""223Apify Actor Script224Target: {target_url}225Goal: {user_goal}226Best Actor: {best_actor}227Generated by: ScraperCodeGenerator228"""229
230import json231from apify import Actor232from bs4 import BeautifulSoup233
234from llmscraper.scraping import MultiActorScraper235
236
237{parsing_code}238
239
240async def main():241 """Main actor function."""242 async with Actor:243 # Get input244 actor_input = await Actor.get_input() or {{}}245 target_url = actor_input.get('targetUrl', '{target_url}')246 user_goal = actor_input.get('userGoal', '{user_goal}')247 apify_token = actor_input.get('apifyToken') or Actor.config.token248 249 Actor.log.info(f"🚀 Starting scraper for: {{target_url}}")250 Actor.log.info(f"📝 Goal: {{user_goal}}")251 Actor.log.info(f"🏆 Using best actor: {best_actor}")252 253 try:254 # Initialize the multi-actor scraper255 scraper = MultiActorScraper(apify_token)256 257 # Run the best actor to get HTML258 Actor.log.info(f"🔄 Running {best_actor} actor...")259 actor_name, html_content = scraper._run_single_actor(260 "{best_actor}", 261 scraper._get_actor_configs(target_url)["{best_actor}"]262 )263 264 if not html_content:265 await Actor.fail(f"Failed to get HTML content from {best_actor} actor")266 return267 268 Actor.log.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")269 270 # Extract data using the generated parsing code271 Actor.log.info("🔍 Extracting data from HTML...")272 extracted_data = extract_data(html_content)273 274 if not extracted_data:275 Actor.log.warning("No data was extracted from the HTML")276 extracted_data = []277 278 # Prepare final results279 results = {{280 "target_url": target_url,281 "extraction_goal": user_goal,282 "actor_used": "{best_actor}",283 "data": extracted_data,284 "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1285 }}286 287 # Save to key-value store288 await Actor.set_value('OUTPUT', results)289 290 Actor.log.info(f"🎉 Successfully extracted {{results['total_items']}} items!")291 Actor.log.info("💾 Results saved to key-value store as 'OUTPUT'")292 293 except Exception as e:294 Actor.log.error(f"❌ Scraping failed: {{e}}")295 await Actor.fail(str(e))296
297
298if __name__ == "__main__":299 import asyncio300 asyncio.run(main())301'''
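One detail worth noting: Claude often wraps its answer in a fenced code block, so _generate_html_parsing_code strips the fence before the function is templated into a script. The same regex, reproduced standalone against a hypothetical response:

import re

# Hypothetical Claude response wrapped in a fenced code block.
raw_response = (
    "Here is the extraction function:\n"
    "```python\n"
    "def extract_data(html_content):\n"
    "    from bs4 import BeautifulSoup\n"
    "    soup = BeautifulSoup(html_content, 'html.parser')\n"
    "    return [{'name': el.get_text(strip=True)} for el in soup.select('.product')]\n"
    "```"
)

parsing_code = raw_response
if "```python" in parsing_code:
    code_match = re.search(r'```python\n(.*?)\n```', parsing_code, re.DOTALL)
    if code_match:
        parsing_code = code_match.group(1)

print(parsing_code)  # only the def extract_data(...) body remains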
src/llmscraper/scraping/__init__.py
1"""2Scraping module for ScraperCodeGenerator.3"""4
5from .apify_runner import ApifyRunner6from .multi_actor_scraper import MultiActorScraper7from .actor_multi_scraper import ActorMultiScraper8
9__all__ = ["ApifyRunner", "MultiActorScraper", "ActorMultiScraper"]
src/llmscraper/scraping/actor_multi_scraper.py
1"""2Apify Actor-specific scraping module for running other actors from within an Apify actor.3"""4
5import logging6from typing import Dict, Any, List, Optional7from apify import Actor8
9
10class ActorMultiScraper:11 """Handles running multiple Apify actors from within an Apify actor context."""12 13 def __init__(self):14 """Initialize the actor scraper."""15 self.logger = logging.getLogger(__name__)16 17 async def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, Optional[str]]:18 """19 Run multiple actors to scrape the target URL and return HTML content.20 21 Args:22 target_url: The URL to scrape23 24 Returns:25 Dictionary mapping actor names to their HTML content (or None if failed)26 """27 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors called for {target_url}")28 results = {}29 30 # Define actor configurations31 actor_configs = self._get_actor_configs(target_url)32 Actor.log.info(f"DEBUG: Will run {len(actor_configs)} actors: {list(actor_configs.keys())}")33 34 # Run each actor and collect results35 for actor_name, config in actor_configs.items():36 try:37 Actor.log.info(f"DEBUG: Starting {actor_name}...")38 html_content = await self._run_single_actor(actor_name, config)39 results[actor_name] = html_content40 41 if html_content:42 Actor.log.info(f"DEBUG: {actor_name} succeeded: {len(html_content):,} characters")43 else:44 Actor.log.warning(f"DEBUG: {actor_name} returned no content")45 46 except Exception as e:47 Actor.log.error(f"DEBUG: {actor_name} failed: {str(e)}")48 results[actor_name] = None49 50 Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors completed. Results: {list(results.keys())}")51 Actor.log.info(f"DEBUG: Results with content: {[name for name, content in results.items() if content]}")52 return results53 54 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:55 """Get configurations for all actors to run."""56 return {57 "cheerio-scraper": {58 "actor_id": "apify/cheerio-scraper",59 "input": {60 "startUrls": [{"url": target_url}],61 "maxRequestRetries": 3,62 "requestTimeoutSecs": 30,63 "maxPagesPerCrawl": 1,64 "pageFunction": """65 async function pageFunction(context) {66 const { request, log, $ } = context;67 try {68 const title = $('title').text() || '';69 const html = $('html').html() || '';70 return {71 url: request.url,72 title: title,73 html: html74 };75 } catch (error) {76 log.error('Error in pageFunction:', error);77 return {78 url: request.url,79 title: '',80 html: ''81 };82 }83 }84 """,85 "proxyConfiguration": {"useApifyProxy": True}86 }87 },88 "web-scraper": {89 "actor_id": "apify/web-scraper",90 "input": {91 "startUrls": [{"url": target_url}],92 "maxRequestRetries": 3,93 "requestTimeoutSecs": 30,94 "maxPagesPerCrawl": 1,95 "pageFunction": """96 async function pageFunction(context) {97 const { request, log, page } = context;98 try {99 const title = await page.title();100 const html = await page.content();101 return { url: request.url, title, html };102 } catch (error) {103 log.error('Error in pageFunction:', error);104 return { url: request.url, title: '', html: '' };105 }106 }107 """,108 "proxyConfiguration": {"useApifyProxy": True}109 }110 },111 "website-content-crawler": {112 "actor_id": "apify/website-content-crawler",113 "input": {114 "startUrls": [{"url": target_url}],115 "maxRequestsPerCrawl": 1,116 "maxCrawlDepth": 0,117 "htmlTransformer": "readableText",118 "readableTextCharThreshold": 100,119 "removeCookieWarnings": True,120 "clickElementsCssSelector": "",121 "proxyConfiguration": {"useApifyProxy": True}122 }123 }124 }125 126 async def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:127 """128 Run a single actor and extract 
HTML content.129 130 Args:131 actor_name: Name of the actor (for logging)132 config: Actor configuration including actor_id and input133 134 Returns:135 HTML content as string, or None if failed136 """137 try:138 actor_id = config["actor_id"]139 actor_input = config["input"]140 141 Actor.log.info(f"DEBUG: Calling actor {actor_id}")142 143 # Call the actor using Apify SDK - use the exact same pattern as working code144 run = await Actor.call(145 actor_id=actor_id,146 run_input=actor_input147 )148 149 if not run:150 Actor.log.error(f"DEBUG: Actor {actor_name} failed to start - run is None")151 return None152 153 Actor.log.info(f"DEBUG: Actor {actor_name} run created with ID: {run.id}")154 Actor.log.info(f"DEBUG: Default dataset ID: {run.default_dataset_id}")155 156 # Use the exact same pattern as your working code157 if run.default_dataset_id:158 try:159 Actor.log.info(f"DEBUG: Getting dataset items for {actor_name}...")160 items = (await Actor.apify_client.dataset(run.default_dataset_id).list_items()).items161 162 if items:163 Actor.log.info(f"DEBUG: Found {len(items)} items in dataset for {actor_name}")164 165 for i, item in enumerate(items):166 Actor.log.info(f"DEBUG: Dataset item {i} keys: {list(item.keys()) if isinstance(item, dict) else type(item)}")167 168 # Look for HTML content in the item169 html_content = self._extract_html_from_item(item, actor_name)170 if html_content:171 Actor.log.info(f"DEBUG: Found HTML content in dataset item {i} for {actor_name}: {len(html_content)} characters")172 return html_content173 else:174 Actor.log.info(f"DEBUG: No dataset items found for {actor_name}")175 176 except Exception as e:177 Actor.log.error(f"DEBUG: Dataset retrieval failed for {actor_name}: {e}")178 import traceback179 Actor.log.error(f"DEBUG: Dataset traceback: {traceback.format_exc()}")180 181 # Fallback: Try key-value store (simplified)182 if run.default_key_value_store_id:183 try:184 Actor.log.info(f"DEBUG: Trying key-value store as fallback for {actor_name}...")185 kvs_client = Actor.apify_client.key_value_store(run.default_key_value_store_id)186 187 # Try common keys that might contain HTML188 common_keys = ['OUTPUT', 'RESULTS', 'DATA']189 for key_name in common_keys:190 try:191 record = await kvs_client.get_record(key_name)192 if record:193 Actor.log.info(f"DEBUG: Found record for key {key_name}")194 html_content = self._extract_html_from_record(record, actor_name)195 if html_content:196 Actor.log.info(f"DEBUG: Found HTML content in key {key_name} for {actor_name}")197 return html_content198 except Exception:199 pass # Key doesn't exist, continue200 201 except Exception as e:202 Actor.log.error(f"DEBUG: Key-value store retrieval failed for {actor_name}: {e}")203 204 Actor.log.warning(f"DEBUG: No HTML content found for {actor_name}")205 return None206 207 except Exception as e:208 Actor.log.error(f"DEBUG: Error running {actor_name}: {str(e)}")209 import traceback210 Actor.log.error(f"DEBUG: Full traceback: {traceback.format_exc()}")211 return None212 213 def _extract_html_from_item(self, item: Dict[str, Any], actor_name: str) -> Optional[str]:214 """Extract HTML content from a dataset item."""215 Actor.log.info(f"DEBUG: Extracting HTML from item for {actor_name}, item keys: {list(item.keys()) if isinstance(item, dict) else 'not a dict'}")216 217 # Look for HTML in common fields218 html_fields = ['html', 'content', 'body', 'pageContent', 'text', 'data']219 220 for field in html_fields:221 if field in item and item[field]:222 content = item[field]223 Actor.log.info(f"DEBUG: Found 
content in field '{field}': {type(content)}, length: {len(content) if isinstance(content, str) else 'N/A'}")224 225 if isinstance(content, str) and len(content) > 100:226 # Check if it looks like HTML227 if '<' in content and '>' in content:228 Actor.log.info(f"DEBUG: Found HTML content in field '{field}' for {actor_name}")229 return content230 elif actor_name == "website-content-crawler":231 # For website-content-crawler, text content is also acceptable232 Actor.log.info(f"DEBUG: Found text content in field '{field}' for {actor_name}")233 html_content = f"<html><body><div>{content}</div></body></html>"234 return html_content235 236 # For website-content-crawler, look for any text-like content237 if actor_name == "website-content-crawler":238 for key, value in item.items():239 if isinstance(value, str) and len(value) > 50:240 Actor.log.info(f"DEBUG: Using text content from field '{key}' for website-content-crawler")241 html_content = f"<html><body><div>{value}</div></body></html>"242 return html_content243 244 Actor.log.info(f"DEBUG: No HTML content found in item for {actor_name}")245 return None246 247 def _extract_html_from_record(self, record: Any, actor_name: str) -> Optional[str]:248 """Extract HTML content from a key-value store record."""249 try:250 # The record might be the content directly or wrapped in a dict251 content = record252 253 if hasattr(record, 'value'):254 content = record.value255 elif isinstance(record, dict) and 'value' in record:256 content = record['value']257 258 # If content is a string, check if it's HTML259 if isinstance(content, str):260 if len(content) > 100 and ('<' in content or actor_name == "website-content-crawler"):261 return content262 263 # If content is a dict, look for HTML fields264 elif isinstance(content, dict):265 html_content = self._extract_html_from_item(content, actor_name)266 if html_content:267 return html_content268 269 return None270 271 except Exception as e:272 Actor.log.debug(f"DEBUG: Error extracting HTML from record for {actor_name}: {e}")273 return None
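The dataset-item heuristic above decides whether a field already holds HTML or plain text that should be wrapped (only for website-content-crawler). A simplified standalone restatement of that rule, assuming plain dict items and no Actor logging, so it can be tested locally:

from typing import Optional


def pick_html(item: dict, actor_name: str) -> Optional[str]:
    """Simplified restatement of the field scan in _extract_html_from_item."""
    for field in ("html", "content", "body", "pageContent", "text", "data"):
        value = item.get(field)
        if isinstance(value, str) and len(value) > 100:
            if "<" in value and ">" in value:
                return value  # looks like real HTML
            if actor_name == "website-content-crawler":
                # Plain text from the content crawler is wrapped so downstream
                # steps can still treat it as HTML.
                return f"<html><body><div>{value}</div></body></html>"
    return None


sample = {"url": "https://example.com", "html": "<html><body>" + "x" * 200 + "</body></html>"}
print(pick_html(sample, "cheerio-scraper") is not None)         # True
print(pick_html({"text": "short"}, "website-content-crawler"))  # None (under the 100-char threshold)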
src/llmscraper/scraping/apify_runner.py
1"""2Apify integration for running actors and retrieving results.3"""4
5import logging6from typing import Optional, Dict, Any, List, Union7
8from apify_client import ApifyClient9
10
11class ApifyRunner:12 """Handles running Apify actors and retrieving results."""13 14 def __init__(self, api_token: str):15 """Initialize with API token."""16 if not api_token or not api_token.strip():17 raise ValueError("API token cannot be empty")18 19 self.client = ApifyClient(api_token)20 self.logger = logging.getLogger(__name__)21 22 def run_actor(self, actor_id: str, actor_input: dict, 23 retrieve_from: str = "auto") -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:24 """25 Run an Apify actor and retrieve results.26 27 Args:28 actor_id: The ID of the Apify actor29 actor_input: Input configuration for the actor30 retrieve_from: "auto", "dataset", "key-value-store", or "both"31 32 Returns:33 Retrieved data or None if failed34 """35 if not actor_id or not actor_id.strip():36 raise ValueError("actor_id cannot be empty")37 38 if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:39 raise ValueError("Invalid retrieve_from option")40 41 # Determine storage type42 if retrieve_from == "auto":43 retrieve_from = "key-value-store" if "website-content-crawler" in actor_id else "dataset"44 45 try:46 self.logger.info(f"Starting Apify actor: {actor_id}")47 48 # Start the actor run49 run = self.client.actor(actor_id).call(run_input=actor_input)50 51 if not run or run.get('status') != 'SUCCEEDED':52 self.logger.error(f"Actor run failed: {run.get('status') if run else 'No run created'}")53 return None54 55 run_id = run.get('id')56 self.logger.info(f"Actor run {run_id} completed successfully")57 58 # Retrieve results based on type59 if retrieve_from == "dataset":60 return self._get_dataset_items(run_id)61 elif retrieve_from == "key-value-store":62 return self._get_key_value_store_items(run_id)63 elif retrieve_from == "both":64 return {65 "dataset": self._get_dataset_items(run_id),66 "key_value_store": self._get_key_value_store_items(run_id)67 }68 69 except Exception as e:70 self.logger.error(f"Error running actor {actor_id}: {str(e)}")71 return None72 73 def _get_dataset_items(self, run_id: str) -> List[Dict[str, Any]]:74 """Get items from the dataset of a run."""75 try:76 dataset_id = self.client.run(run_id).get().get('defaultDatasetId')77 if not dataset_id:78 self.logger.warning(f"No dataset found for run {run_id}")79 return []80 81 dataset_items = list(self.client.dataset(dataset_id).iterate_items())82 self.logger.info(f"Retrieved {len(dataset_items)} items from dataset")83 return dataset_items84 85 except Exception as e:86 self.logger.error(f"Error retrieving dataset items: {str(e)}")87 return []88 89 def _get_key_value_store_items(self, run_id: str) -> Dict[str, Any]:90 """Get items from the key-value store of a run."""91 try:92 kvs_id = self.client.run(run_id).get().get('defaultKeyValueStoreId')93 if not kvs_id:94 self.logger.warning(f"No key-value store found for run {run_id}")95 return {}96 97 kvs = self.client.key_value_store(kvs_id)98 keys = list(kvs.list_keys())99 100 items = {}101 for key_info in keys:102 # Handle case where key_info might be a string or dict103 if isinstance(key_info, dict):104 key_name = key_info.get('key')105 else:106 key_name = str(key_info)107 108 if key_name:109 try:110 value = kvs.get_record(key_name)111 if value:112 # Handle case where value might be a string or dict113 if isinstance(value, dict):114 items[key_name] = value.get('value', value)115 else:116 items[key_name] = value117 except Exception as e:118 self.logger.warning(f"Failed to retrieve key '{key_name}': {str(e)}")119 120 self.logger.info(f"Retrieved {len(items)} items from 
key-value store")121 return items122 123 except Exception as e:124 self.logger.error(f"Error retrieving key-value store items: {str(e)}")125 return {}126
127
128# Legacy functions for backward compatibility129def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:130 """Legacy function - use ApifyRunner class instead."""131 runner = ApifyRunner(api_token)132 result = runner.run_actor(actor_id, actor_input, "dataset")133 return result if isinstance(result, list) else None134
135
136def run_apify_actor_with_flexible_retrieval(137 actor_id: str, actor_input: dict, *, api_token: str, retrieve_from: str = "auto"138) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:139 """Legacy function - use ApifyRunner class instead."""140 runner = ApifyRunner(api_token)141 return runner.run_actor(actor_id, actor_input, retrieve_from)
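A minimal usage sketch of ApifyRunner on its own, assuming APIFY_TOKEN is set; the input mirrors the cheerio-scraper configuration used by MultiActorScraper below, and the target URL is a placeholder (running this starts a real, billable actor run).

import os

from llmscraper.scraping import ApifyRunner

runner = ApifyRunner(os.environ["APIFY_TOKEN"])

items = runner.run_actor(
    "apify/cheerio-scraper",
    {
        "startUrls": [{"url": "https://example.com"}],  # placeholder target
        "maxRequestsPerCrawl": 1,
        "pageFunction": """
            async function pageFunction(context) {
                const { request, $ } = context;
                return { url: request.url, title: $('title').text(), html: $('html').html() };
            }
        """,
        "proxyConfiguration": {"useApifyProxy": True},
    },
    retrieve_from="dataset",  # "auto" would also resolve to the dataset for this actor
)

if items:
    print(f"Retrieved {len(items)} dataset item(s); first item keys: {list(items[0].keys())}")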
src/llmscraper/scraping/multi_actor_scraper.py
1"""2Multi-actor scraping functionality.3"""4
5import logging6from typing import Dict, Any7from concurrent.futures import ThreadPoolExecutor, as_completed8
9from .apify_runner import ApifyRunner10
11
12class MultiActorScraper:13 """Scrapes websites using multiple Apify actors simultaneously."""14 15 def __init__(self, api_token: str):16 """Initialize with Apify API token."""17 self.api_token = api_token18 self.runner = ApifyRunner(api_token)19 self.logger = logging.getLogger(__name__)20 21 def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, str]:22 """23 Scrape a URL with multiple actors and return HTML content.24 25 Args:26 target_url: URL to scrape27 28 Returns:29 Dict mapping actor names to HTML content30 """31 actor_configs = self._get_actor_configs(target_url)32 results = {}33 34 # Use ThreadPoolExecutor for concurrent execution35 with ThreadPoolExecutor(max_workers=len(actor_configs)) as executor:36 future_to_actor = {37 executor.submit(self._run_single_actor, name, config): name38 for name, config in actor_configs.items()39 }40 41 for future in as_completed(future_to_actor):42 actor_name = future_to_actor[future]43 try:44 name, html_content = future.result()45 results[name] = html_content46 except Exception as e:47 self.logger.error(f"Actor {actor_name} failed: {str(e)}")48 results[actor_name] = None49 50 return results51 52 def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:53 """Get actor configurations for the target URL."""54 return {55 "cheerio-scraper": {56 "actor_id": "apify/cheerio-scraper",57 "input": {58 "startUrls": [{"url": target_url}],59 "maxRequestRetries": 3,60 "requestTimeoutSecs": 30,61 "maxRequestsPerCrawl": 1,62 "pseudoUrls": [],63 "linkSelector": "",64 "pageFunction": """65 async function pageFunction(context) {66 const { request, log, skipLinks, $ } = context;67 return {68 url: request.url,69 title: $('title').text(),70 html: $('html').html()71 };72 }73 """,74 "proxyConfiguration": {"useApifyProxy": True}75 }76 },77 "web-scraper": {78 "actor_id": "apify/web-scraper",79 "input": {80 "startUrls": [{"url": target_url}],81 "maxRequestRetries": 3,82 "requestTimeoutSecs": 30,83 "maxPagesPerCrawl": 1,84 "pageFunction": """85 async function pageFunction(context) {86 const { request, log, skipLinks, $ } = context;87 return {88 url: request.url,89 title: $('title').text(),90 html: $('html').html()91 };92 }93 """,94 "proxyConfiguration": {"useApifyProxy": True}95 }96 },97 "website-content-crawler": {98 "actor_id": "apify/website-content-crawler",99 "input": {100 "startUrls": [{"url": target_url}],101 "maxCrawlPages": 1,102 "crawler": "playwright",103 "proxyConfiguration": {"useApifyProxy": True}104 }105 }106 }107 108 def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> tuple[str, str]:109 """110 Run a single actor and extract HTML content.111 112 Args:113 actor_name: Name of the actor114 config: Actor configuration115 116 Returns:117 Tuple of (actor_name, html_content)118 """119 try:120 self.logger.info(f"Starting {actor_name}...")121 122 result = self.runner.run_actor(123 config["actor_id"],124 config["input"],125 "auto"126 )127 128 if not result:129 self.logger.warning(f"{actor_name} returned no results")130 return actor_name, None131 132 # Extract HTML based on result type133 html_content = self._extract_html_from_result(result, actor_name)134 135 if html_content:136 self.logger.info(f"{actor_name} completed successfully: {len(html_content):,} chars")137 else:138 self.logger.warning(f"{actor_name} returned no HTML content")139 140 return actor_name, html_content141 142 except Exception as e:143 self.logger.error(f"Error running {actor_name}: {str(e)}")144 return actor_name, None145 146 def 
_extract_html_from_result(self, result: Any, actor_name: str) -> str:147 """Extract HTML content from actor result."""148 try:149 if isinstance(result, list) and result:150 # Dataset result151 item = result[0]152 return item.get('html') or item.get('content', '')153 elif isinstance(result, dict):154 # Key-value store result155 if 'OUTPUT' in result:156 output = result['OUTPUT']157 if isinstance(output, dict):158 return output.get('html') or output.get('content', '')159 elif isinstance(output, str):160 return output161 162 self.logger.warning(f"Unexpected result format from {actor_name}")163 return None164 165 except Exception as e:166 self.logger.error(f"Error extracting HTML from {actor_name}: {str(e)}")167 return None
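A short sketch of driving the concurrent scrape directly and checking which actors returned content, assuming APIFY_TOKEN is set and using a placeholder URL:

import os

from llmscraper.scraping import MultiActorScraper

scraper = MultiActorScraper(os.environ["APIFY_TOKEN"])

# Runs cheerio-scraper, web-scraper, and website-content-crawler in parallel
# threads and maps each actor name to its HTML (or None on failure).
results = scraper.scrape_with_multiple_actors("https://example.com")

for actor_name, html in results.items():
    print(f"{actor_name}: {len(html):,} chars" if html else f"{actor_name}: no content")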
src/llmscraper/utils/__init__.py
1"""2Utilities module for ScraperCodeGenerator.3"""4
5from .html_utils import is_html, prune_html, extract_text_content, validate_html_structure6from .config import get_api_key, validate_required_keys, setup_logging7
8__all__ = [9 "is_html",10 "prune_html", 11 "extract_text_content",12 "validate_html_structure",13 "get_api_key",14 "validate_required_keys",15 "setup_logging"16]
src/llmscraper/utils/config.py
1"""2Configuration and environment utilities.3"""4
5import os6from typing import Optional7
8
9def get_api_key(key_name: str, provided_key: Optional[str] = None) -> Optional[str]:10 """11 Get API key from provided value or environment variable.12 13 Args:14 key_name: Name of the environment variable15 provided_key: Explicitly provided key (takes precedence)16 17 Returns:18 API key or None if not found19 """20 if provided_key and provided_key.strip():21 return provided_key.strip()22 23 return os.getenv(key_name)24
25
26def validate_required_keys(**keys) -> dict[str, str]:27 """28 Validate that all required API keys are present.29 30 Args:31 **keys: Key-value pairs of key names and values32 33 Returns:34 Dict of validated keys35 36 Raises:37 ValueError: If any required key is missing38 """39 validated = {}40 missing = []41 42 for key_name, key_value in keys.items():43 if not key_value or not key_value.strip():44 missing.append(key_name)45 else:46 validated[key_name] = key_value.strip()47 48 if missing:49 raise ValueError(f"Missing required API keys: {', '.join(missing)}")50 51 return validated52
53
54def setup_logging(level: str = "INFO") -> None:55 """56 Setup logging configuration.57 58 Args:59 level: Logging level (DEBUG, INFO, WARNING, ERROR)60 """61 import logging62 63 logging.basicConfig(64 level=getattr(logging, level.upper()),65 format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',66 handlers=[67 logging.StreamHandler()68 ]69 )
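A quick sketch of the two helpers above: an explicitly provided key takes precedence over the environment variable, and blank keys are reported together in a single ValueError.

import os

from llmscraper.utils import get_api_key, validate_required_keys

os.environ["CLAUDE_API_KEY"] = "env-key"

print(get_api_key("CLAUDE_API_KEY"))                  # "env-key" (falls back to the environment)
print(get_api_key("CLAUDE_API_KEY", "explicit-key"))  # "explicit-key" (provided value wins)

try:
    validate_required_keys(apify_token="abc123", claude_api_key="   ")
except ValueError as exc:
    print(exc)  # Missing required API keys: claude_api_key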
src/llmscraper/utils/html_utils.py
1"""2HTML utility functions for processing web content.3"""4
5from typing import Optional6from bs4 import BeautifulSoup, Comment, NavigableString7import re8
9
10def is_html(text_content: str) -> bool:11 """12 Check if a string is likely HTML content.13 14 Args:15 text_content: The text content to check16 17 Returns:18 True if the content appears to be HTML19 """20 if not text_content or not isinstance(text_content, str):21 return False22 23 content_lower = text_content.lower()24 return '<html>' in content_lower and '<body>' in content_lower25
26
27def prune_html(html_content: str, max_list_items: int = 5, max_text_length: int = 500) -> str:28 """29 Clean and shorten HTML content to reduce token count while preserving structure.30 31 Args:32 html_content: The raw HTML content to process33 max_list_items: Maximum number of list items to keep34 max_text_length: Maximum length of text content in any tag35 36 Returns:37 The cleaned and shortened HTML38 """39 if not html_content or not isinstance(html_content, str):40 return ""41 42 try:43 soup = BeautifulSoup(html_content, 'html.parser')44 45 # Remove unwanted tags entirely46 unwanted_tags = ['script', 'style', 'svg', 'noscript', 'meta', 'link']47 for tag_name in unwanted_tags:48 for tag in soup.find_all(tag_name):49 tag.decompose()50 51 # Remove HTML comments52 for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):53 comment.extract()54 55 # Remove unwanted attributes from all tags56 allowed_attributes = {'id', 'class', 'href', 'src', 'alt', 'title'}57 for tag in soup.find_all(True):58 if hasattr(tag, 'attrs'):59 tag.attrs = {key: value for key, value in tag.attrs.items() 60 if key in allowed_attributes}61 62 # Truncate lists and tables63 list_and_table_tags = ['ul', 'ol', 'table', 'tbody', 'thead']64 for tag_name in list_and_table_tags:65 for tag in soup.find_all(tag_name):66 children = list(tag.children)67 # Filter out NavigableString objects (text nodes, whitespace)68 non_text_children = [child for child in children if not isinstance(child, NavigableString)]69 70 if len(non_text_children) > max_list_items:71 # Keep only the first max_list_items children72 for child in non_text_children[max_list_items:]:73 child.decompose()74 75 # Add a comment indicating truncation76 if tag.name in ['ul', 'ol']:77 truncation_notice = soup.new_tag("li")78 truncation_notice.string = f"... ({len(non_text_children) - max_list_items} more items)"79 tag.append(truncation_notice)80 elif tag.name == 'table':81 truncation_notice = soup.new_tag("tr")82 td = soup.new_tag("td")83 td.string = f"... ({len(non_text_children) - max_list_items} more rows)"84 truncation_notice.append(td)85 tag.append(truncation_notice)86 87 # Truncate long text content88 for element in soup.find_all(string=True):89 if isinstance(element, NavigableString) and not isinstance(element, Comment):90 text = str(element).strip()91 if len(text) > max_text_length:92 element.replace_with(text[:max_text_length] + "...")93 94 # Return the cleaned HTML95 return str(soup)96 97 except Exception as e:98 # If parsing fails, return original content truncated99 return html_content[:max_text_length * 10] if len(html_content) > max_text_length * 10 else html_content100
101
102def extract_text_content(html_content: str) -> str:103 """104 Extract clean text content from HTML.105 106 Args:107 html_content: HTML content to extract text from108 109 Returns:110 Clean text content111 """112 if not html_content:113 return ""114 115 try:116 soup = BeautifulSoup(html_content, 'html.parser')117 return soup.get_text(separator=' ', strip=True)118 except Exception:119 return html_content120
121
122def validate_html_structure(html_content: str) -> bool:123 """124 Validate basic HTML structure.125 126 Args:127 html_content: HTML content to validate128 129 Returns:130 True if HTML has basic valid structure131 """132 if not html_content:133 return False134 135 try:136 soup = BeautifulSoup(html_content, 'html.parser')137 138 # Check for basic HTML elements139 has_html_tag = soup.find('html') is not None140 has_body_tag = soup.find('body') is not None141 has_content = len(soup.get_text(strip=True)) > 0142 143 return has_html_tag or has_body_tag or has_content144 145 except Exception:146 return False
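To see the pruning behaviour end to end, the sketch below feeds prune_html a synthetic page containing a script tag and a 20-item list; with max_list_items=3 the list is truncated and a placeholder item notes how many entries were dropped.

from llmscraper.utils import prune_html

html = (
    "<html><body>"
    "<script>console.log('stripped');</script>"
    "<ul>" + "".join(f"<li>item {i}</li>" for i in range(20)) + "</ul>"
    "</body></html>"
)

pruned = prune_html(html, max_list_items=3, max_text_length=100)
print(pruned)
# The <script> tag is removed and the <ul> keeps three <li> items plus a
# "... (17 more items)" entry, matching the evaluation-stage pruning above.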