Flow AI is a platform for building, deploying, and monetizing AI agents tailored for the Web3 ecosystem and their community. It enables users to gather insights from onchain and offchain data and run complex transactions.

Flow AI

5.0

Natural Language Dataset Query

apify/natural-language-dataset-query

Use natural language queries to retrieve results from an Apify dataset. This Actor provides a query engine that loads a dataset, executes SQL queries, and synthesizes results. It works as an MCP (Model Context Protocol) server or REST API in Actor standby mode.

Apify

Lara Translate MCP Server

mcp-servers/lara-translate-mcp-server

The Lara Translate MCP Server handles translation requests and manages multilingual processing for Lara Translate services.

MCP servers

MCP Stress Tester

jakub.kopecky/mcp-stress-tester

A simple MCP Stress Tester client Actor for stress-testing your Model Context Protocol server. 💻⚡

Jakub Kopecký

Calculator MCP server

mcp-servers/calculator-mcp-server

A Model Context Protocol server for calculating. This server enables LLMs to use calculator for precise numerical calculations.

MCP servers

Playwright MCP Server

jiri.spilka/playwright-mcp-server

A Model Context Protocol (MCP) server that provides browser automation capabilities using Playwright

Jiří Spilka

Firecrawl MCP Server

mcp-servers/firecrawl-mcp-server

A Model Context Protocol (MCP) server implementation that integrates with Firecrawl MCP for web scraping capabilities

MCP servers

Cars Com Unlimited

oodoow/cars-com-unlimited

HonzaS

Mastra.ai MCP Agent

jakub.kopecky/actor-mastra-mcp-agent

🤖 AI agent using mastra.ai with Apify MCP Server. 🚀 Runs queries via OpenAI models, taps Apify Actors for web data, and outputs to datasets. 🛠️

Jakub Kopecký

Apify Email Signature Generator

apify/email-signature-generator

Generates a standardized email signature for Apify team members, using provided details.

Apify

111

4.3

What is web scraping?

Best web scraping APIs in 2025

Pros and cons of web scraping

# --- General ---
.DS_Store
.env
.env.*

# --- Logs ---
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

# --- IDEs ---
.vscode/*
!.vscode/extensions.json
.idea/
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

# --- Python ---
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.venv/
venv/
ENV/
env/
.env
.env.*
.pytest_cache/
.coverage
htmlcov/
.tox/
.cache
.mypy_cache/
.dmypy.json
dmypy.json

# Project specific
scraped_results/
*.html

# --- Node.js / Frontend ---
frontend/node_modules/
frontend/dist/
frontend/dist-ssr/
frontend/.pnp
frontend/.pnp.js
frontend/.npm
node_modules
dist
dist-ssr
*.local
# Added by Apify CLI
storage
.venv

# --- Apify ---
storage/
apify_storage/

# --- Local test files ---
input.json
test_*

.python-version

3.10

Dockerfile

# Use the official Apify Python base image
FROM apify/actor-python:3.11

# Copy requirements and install dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the source code
COPY . ./

# Set the entrypoint
CMD ["python", "main.py"]

cli.py

1#!/usr/bin/env python3
2"""
3CLI Entry Point for LLMScraper
4"""
5
6import sys
7import os
8
9# Add src to path for development
10sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
11
12from llmscraper.main import main
13
14if __name__ == "__main__":
15    sys.exit(main())

main.py

1"""
2Apify Actor entry point for LLMScraper.
3"""
4
5import asyncio
6import os
7import sys
8
9# Add src to path
10sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
11
12from apify import Actor
13from llmscraper import run_intelligent_scraper
14
15
16async def main():
17    """Main actor function."""
18    async with Actor:
19        Actor.log.info("DEBUG: Inside Actor context")
20        Actor.log.info("🚀 LLMScraper Actor starting...")
21        
22        # Get input
23        actor_input = await Actor.get_input() or {}
24        
25        # Extract parameters
26        target_url = actor_input.get('targetUrl')
27        user_goal = actor_input.get('userGoal')
28        claude_api_key = actor_input.get('claudeApiKey') 
29        
30        # Use actor token for Apify operations
31        apify_token = Actor.config.token
32        
33        if not target_url:
34            raise ValueError("targetUrl is required")
35        
36        if not user_goal:
37            raise ValueError("userGoal is required")
38            
39        if not claude_api_key:
40            raise ValueError("claudeApiKey is required")
41        
42        Actor.log.info(f"Target URL: {target_url}")
43        Actor.log.info(f"User Goal: {user_goal}")
44        
45        Actor.log.info("DEBUG: About to call run_intelligent_scraper")
46        
47        try:
48            # Run the intelligent scraper
49            Actor.log.info("DEBUG: About to call run_intelligent_scraper with parameters:")
50            Actor.log.info(f"  - target_url: {target_url}")
51            Actor.log.info(f"  - user_goal: {user_goal}")
52            Actor.log.info(f"  - has claude_api_key: {bool(claude_api_key)}")
53            Actor.log.info(f"  - has apify_token: {bool(apify_token)}")
54            
55            results = await run_intelligent_scraper(
56                target_url=target_url,
57                user_goal=user_goal,
58                claude_api_key=claude_api_key,
59                apify_token=apify_token,
60                for_actor=True,  # Set for actor mode
61                actor_logger=Actor.log,  # Pass Actor logger
62                test_script=True  # Enable testing in actor mode
63            )
64            
65            Actor.log.info("DEBUG: run_intelligent_scraper completed")
66            Actor.log.info(f"DEBUG: Results type: {type(results)}")
67            Actor.log.info(f"DEBUG: Results success: {getattr(results, 'success', 'No success attr')}")
68            
69            if hasattr(results, 'error_message') and results.error_message:
70                Actor.log.error(f"DEBUG: Results error: {results.error_message}")
71            
72            if hasattr(results, 'quality_scores') and results.quality_scores:
73                Actor.log.info(f"DEBUG: Quality scores: {results.quality_scores}")
74                
75            if hasattr(results, 'best_actor') and results.best_actor:
76                Actor.log.info(f"DEBUG: Best actor: {results.best_actor}")
77            
78            if hasattr(results, 'generated_script') and results.generated_script:
79                Actor.log.info(f"DEBUG: Generated script length: {len(results.generated_script)} characters")
80            
81            if hasattr(results, 'extracted_data') and results.extracted_data:
82                Actor.log.info(f"DEBUG: Test extraction successful: {len(results.extracted_data) if isinstance(results.extracted_data, list) else 1} items")
83            
84            if results.success:
85                Actor.log.info(f"Scraping completed successfully!")
86                
87                # Show the generated script in actor mode
88                if hasattr(results, 'generated_script') and results.generated_script:
89                    Actor.log.info("\\n" + "="*60)
90                    Actor.log.info("📄 GENERATED ACTOR SCRIPT")
91                    Actor.log.info("="*60)
92                    Actor.log.info(results.generated_script)
93                    Actor.log.info("="*60)
94                
95                # Create output data
96                output_data = {
97                    "url": target_url,
98                    "title": getattr(results, 'title', 'N/A'),
99                    "data": results.extracted_data if hasattr(results, 'extracted_data') else {},
100                    "generated_script": getattr(results, 'generated_script', ''),
101                    "best_actor": getattr(results, 'best_actor', ''),
102                    "quality_scores": getattr(results, 'quality_scores', {}),
103                    "timestamp": "2025-06-29T20:00:00Z",  # You might want to use actual timestamp
104                    "success": True,
105                    "error": None
106                }
107                
108                # Push results to default dataset
109                await Actor.push_data([output_data])
110            else:
111                Actor.log.error(f"Scraping failed: {results.error_message}")
112                
113                # Still push error data for tracking
114                error_data = {
115                    "url": target_url,
116                    "title": "N/A",
117                    "data": {},
118                    "timestamp": "2025-06-29T20:00:00Z",
119                    "success": False,
120                    "error": results.error_message
121                }
122                
123                await Actor.push_data([error_data])
124            
125            Actor.log.info("✅ LLMScraper Actor finished successfully!")
126            
127        except Exception as e:
128            Actor.log.error(f"❌ Error during scraping: {str(e)}")
129            raise
130
131
132if __name__ == "__main__":
133    asyncio.run(main())

package.json

{
  "name": "scraper-code-generator",
  "version": "0.1.0",
  "description": "Intelligent web scraping framework using AI-powered quality evaluation",
  "main": "main.py",
  "scripts": {
    "start": "python main.py"
  },
  "dependencies": {},
  "author": "",
  "license": "MIT"
}

pyproject.toml

1[build-system]
2requires = ["hatchling"]
3build-backend = "hatchling.build"
4
5[project]
6name = "llmscraper"
7version = "0.1.0"
8description = "Intelligent web scraping framework using AI-powered quality evaluation and multiple scraping strategies"
9readme = "README.md"
10requires-python = ">=3.10"
11dependencies = [
12    "anthropic>=0.54.0",
13    "apify-client>=1.11.0", 
14    "beautifulsoup4>=4.12.0",
15    "apify>=1.5.0",
16]
17
18[project.scripts]
19llmscraper = "llmscraper.main:main"
20
21[project.optional-dependencies]
22dev = [
23    "pytest>=7.0",
24    "pytest-asyncio>=0.21.0",
25    "black>=23.0",
26    "isort>=5.12.0",
27    "mypy>=1.5.0",
28]
29
30[tool.hatch.build.targets.wheel]
31packages = ["src/llmscraper"]
32
33[tool.hatch.build.targets.sdist]
34include = [
35    "/src",
36    "/tests", 
37    "/examples.json",
38    "/README.md",
39]

requirements.txt

1apify>=1.5.0
2apify-client>=1.5.0
3anthropic>=0.7.0
4beautifulsoup4>=4.12.0

uv.lock

version = 1
revision = 2
requires-python = ">=3.10"

[[package]]
name = "annotated-types"
version = "0.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
]

[[package]]
name = "anthropic"
version = "0.54.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "anyio" },
    { name = "distro" },
    { name = "httpx" },
    { name = "jiter" },
    { name = "pydantic" },
    { name = "sniffio" },
    { name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/89/28/80cb9bb6e7ce77d404145b51da4257455805c17f0a6be528ff3286e3882f/anthropic-0.54.0.tar.gz", hash = "sha256:5e6f997d97ce8e70eac603c3ec2e7f23addeff953fbbb76b19430562bb6ba815", size = 312376, upload-time = "2025-06-11T02:46:27.642Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/de/b9/6ffb48e82c5e97b03cecee872d134a6b6666c2767b2d32ed709f3a60a8fe/anthropic-0.54.0-py3-none-any.whl", hash = "sha256:c1062a0a905daeec17ca9c06c401e4b3f24cb0495841d29d752568a1d4018d56", size = 288774, upload-time = "2025-06-11T02:46:25.578Z" },
]

[[package]]
name = "anyio"
version = "4.9.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
    { name = "idna" },
    { name = "sniffio" },
    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/95/7d/4c1bd541d4dffa1b52bd83fb8527089e097a106fc90b467a7313b105f840/anyio-4.9.0.tar.gz", hash = "sha256:673c0c244e15788651a4ff38710fea9675823028a6f08a5eda409e0c9840a028", size = 190949, upload-time = "2025-03-17T00:02:54.77Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" },
]

[[package]]
name = "apify-client"
version = "1.11.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "apify-shared" },
    { name = "colorama" },
    { name = "httpx" },
    { name = "more-itertools" },
]
sdist = { url = "https://files.pythonhosted.org/packages/49/44/b7cae857f2129d4093bc5a0a2267fcbba7905207a0b7cc424dc3c7c90291/apify_client-1.11.0.tar.gz", hash = "sha256:c2e151754c35be9bc7c1028bf7cb127aeb1ffa2fbd1ec1ad7e97b901deb32e08", size = 346095, upload-time = "2025-06-13T11:46:39.129Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/8f/24/d3273bfe5b4a96fd60c8d554edbab99274fae8cb2347b96f2e3fa0bc4d5b/apify_client-1.11.0-py3-none-any.whl", hash = "sha256:9d691960bdbeee17624a2a82aafc4f0bfba9b48820a48f559b7eba76bf01cb3c", size = 82550, upload-time = "2025-06-13T11:46:37.483Z" },
]

[[package]]
name = "apify-shared"
version = "1.4.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b2/a6/c8e2fa0b3bdc479d3ecde778e2381af199f910cf7c8baa3c207bcfe26e47/apify_shared-1.4.1.tar.gz", hash = "sha256:16e617c840fd27bf38d980f079c0b867c7378f68c7006b3d5a7d530d43930507", size = 13871, upload-time = "2025-04-28T12:20:01.113Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/1d/f3/3446c8a7986fdc087024d4e174e4b3f587097a9b28f6f8e8c788199225b2/apify_shared-1.4.1-py3-none-any.whl", hash = "sha256:abac5712b6e8eb96693204cbb2702905e1971d9084b1716e7337852b5005290e", size = 12706, upload-time = "2025-04-28T12:19:59.792Z" },
]

[[package]]
name = "certifi"
version = "2025.6.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" },
]

[[package]]
name = "colorama"
version = "0.4.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]

[[package]]
name = "distro"
version = "1.9.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" },
]

[[package]]
name = "exceptiongroup"
version = "1.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },
]

[[package]]
name = "h11"
version = "0.16.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
]

[[package]]
name = "httpcore"
version = "1.0.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "certifi" },
    { name = "h11" },
]
sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
]

[[package]]
name = "httpx"
version = "0.28.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "anyio" },
    { name = "certifi" },
    { name = "httpcore" },
    { name = "idna" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
]

[[package]]
name = "idna"
version = "3.10"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" },
]

[[package]]
name = "jiter"
version = "0.10.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ee/9d/ae7ddb4b8ab3fb1b51faf4deb36cb48a4fbbd7cb36bad6a5fca4741306f7/jiter-0.10.0.tar.gz", hash = "sha256:07a7142c38aacc85194391108dc91b5b57093c978a9932bd86a36862759d9500", size = 162759, upload-time = "2025-05-18T19:04:59.73Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/be/7e/4011b5c77bec97cb2b572f566220364e3e21b51c48c5bd9c4a9c26b41b67/jiter-0.10.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:cd2fb72b02478f06a900a5782de2ef47e0396b3e1f7d5aba30daeb1fce66f303", size = 317215, upload-time = "2025-05-18T19:03:04.303Z" },
    { url = "https://files.pythonhosted.org/packages/8a/4f/144c1b57c39692efc7ea7d8e247acf28e47d0912800b34d0ad815f6b2824/jiter-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:32bb468e3af278f095d3fa5b90314728a6916d89ba3d0ffb726dd9bf7367285e", size = 322814, upload-time = "2025-05-18T19:03:06.433Z" },
    { url = "https://files.pythonhosted.org/packages/63/1f/db977336d332a9406c0b1f0b82be6f71f72526a806cbb2281baf201d38e3/jiter-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8b3e0068c26ddedc7abc6fac37da2d0af16b921e288a5a613f4b86f050354f", size = 345237, upload-time = "2025-05-18T19:03:07.833Z" },
    { url = "https://files.pythonhosted.org/packages/d7/1c/aa30a4a775e8a672ad7f21532bdbfb269f0706b39c6ff14e1f86bdd9e5ff/jiter-0.10.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:286299b74cc49e25cd42eea19b72aa82c515d2f2ee12d11392c56d8701f52224", size = 370999, upload-time = "2025-05-18T19:03:09.338Z" },
    { url = "https://files.pythonhosted.org/packages/35/df/f8257abc4207830cb18880781b5f5b716bad5b2a22fb4330cfd357407c5b/jiter-0.10.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ed5649ceeaeffc28d87fb012d25a4cd356dcd53eff5acff1f0466b831dda2a7", size = 491109, upload-time = "2025-05-18T19:03:11.13Z" },
    { url = "https://files.pythonhosted.org/packages/06/76/9e1516fd7b4278aa13a2cc7f159e56befbea9aa65c71586305e7afa8b0b3/jiter-0.10.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2ab0051160cb758a70716448908ef14ad476c3774bd03ddce075f3c1f90a3d6", size = 388608, upload-time = "2025-05-18T19:03:12.911Z" },
    { url = "https://files.pythonhosted.org/packages/6d/64/67750672b4354ca20ca18d3d1ccf2c62a072e8a2d452ac3cf8ced73571ef/jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03997d2f37f6b67d2f5c475da4412be584e1cec273c1cfc03d642c46db43f8cf", size = 352454, upload-time = "2025-05-18T19:03:14.741Z" },
    { url = "https://files.pythonhosted.org/packages/96/4d/5c4e36d48f169a54b53a305114be3efa2bbffd33b648cd1478a688f639c1/jiter-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c404a99352d839fed80d6afd6c1d66071f3bacaaa5c4268983fc10f769112e90", size = 391833, upload-time = "2025-05-18T19:03:16.426Z" },
    { url = "https://files.pythonhosted.org/packages/0b/de/ce4a6166a78810bd83763d2fa13f85f73cbd3743a325469a4a9289af6dae/jiter-0.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66e989410b6666d3ddb27a74c7e50d0829704ede652fd4c858e91f8d64b403d0", size = 523646, upload-time = "2025-05-18T19:03:17.704Z" },
    { url = "https://files.pythonhosted.org/packages/a2/a6/3bc9acce53466972964cf4ad85efecb94f9244539ab6da1107f7aed82934/jiter-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b532d3af9ef4f6374609a3bcb5e05a1951d3bf6190dc6b176fdb277c9bbf15ee", size = 514735, upload-time = "2025-05-18T19:03:19.44Z" },
    { url = "https://files.pythonhosted.org/packages/b4/d8/243c2ab8426a2a4dea85ba2a2ba43df379ccece2145320dfd4799b9633c5/jiter-0.10.0-cp310-cp310-win32.whl", hash = "sha256:da9be20b333970e28b72edc4dff63d4fec3398e05770fb3205f7fb460eb48dd4", size = 210747, upload-time = "2025-05-18T19:03:21.184Z" },
    { url = "https://files.pythonhosted.org/packages/37/7a/8021bd615ef7788b98fc76ff533eaac846322c170e93cbffa01979197a45/jiter-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:f59e533afed0c5b0ac3eba20d2548c4a550336d8282ee69eb07b37ea526ee4e5", size = 207484, upload-time = "2025-05-18T19:03:23.046Z" },
    { url = "https://files.pythonhosted.org/packages/1b/dd/6cefc6bd68b1c3c979cecfa7029ab582b57690a31cd2f346c4d0ce7951b6/jiter-0.10.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3bebe0c558e19902c96e99217e0b8e8b17d570906e72ed8a87170bc290b1e978", size = 317473, upload-time = "2025-05-18T19:03:25.942Z" },
    { url = "https://files.pythonhosted.org/packages/be/cf/fc33f5159ce132be1d8dd57251a1ec7a631c7df4bd11e1cd198308c6ae32/jiter-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:558cc7e44fd8e507a236bee6a02fa17199ba752874400a0ca6cd6e2196cdb7dc", size = 321971, upload-time = "2025-05-18T19:03:27.255Z" },
    { url = "https://files.pythonhosted.org/packages/68/a4/da3f150cf1d51f6c472616fb7650429c7ce053e0c962b41b68557fdf6379/jiter-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d613e4b379a07d7c8453c5712ce7014e86c6ac93d990a0b8e7377e18505e98d", size = 345574, upload-time = "2025-05-18T19:03:28.63Z" },
    { url = "https://files.pythonhosted.org/packages/84/34/6e8d412e60ff06b186040e77da5f83bc158e9735759fcae65b37d681f28b/jiter-0.10.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f62cf8ba0618eda841b9bf61797f21c5ebd15a7a1e19daab76e4e4b498d515b2", size = 371028, upload-time = "2025-05-18T19:03:30.292Z" },
    { url = "https://files.pythonhosted.org/packages/fb/d9/9ee86173aae4576c35a2f50ae930d2ccb4c4c236f6cb9353267aa1d626b7/jiter-0.10.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:919d139cdfa8ae8945112398511cb7fca58a77382617d279556b344867a37e61", size = 491083, upload-time = "2025-05-18T19:03:31.654Z" },
    { url = "https://files.pythonhosted.org/packages/d9/2c/f955de55e74771493ac9e188b0f731524c6a995dffdcb8c255b89c6fb74b/jiter-0.10.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ddbc6ae311175a3b03bd8994881bc4635c923754932918e18da841632349db", size = 388821, upload-time = "2025-05-18T19:03:33.184Z" },
    { url = "https://files.pythonhosted.org/packages/81/5a/0e73541b6edd3f4aada586c24e50626c7815c561a7ba337d6a7eb0a915b4/jiter-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c440ea003ad10927a30521a9062ce10b5479592e8a70da27f21eeb457b4a9c5", size = 352174, upload-time = "2025-05-18T19:03:34.965Z" },
    { url = "https://files.pythonhosted.org/packages/1c/c0/61eeec33b8c75b31cae42be14d44f9e6fe3ac15a4e58010256ac3abf3638/jiter-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc347c87944983481e138dea467c0551080c86b9d21de6ea9306efb12ca8f606", size = 391869, upload-time = "2025-05-18T19:03:36.436Z" },
    { url = "https://files.pythonhosted.org/packages/41/22/5beb5ee4ad4ef7d86f5ea5b4509f680a20706c4a7659e74344777efb7739/jiter-0.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:13252b58c1f4d8c5b63ab103c03d909e8e1e7842d302473f482915d95fefd605", size = 523741, upload-time = "2025-05-18T19:03:38.168Z" },
    { url = "https://files.pythonhosted.org/packages/ea/10/768e8818538e5817c637b0df52e54366ec4cebc3346108a4457ea7a98f32/jiter-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d1bbf3c465de4a24ab12fb7766a0003f6f9bce48b8b6a886158c4d569452dc5", size = 514527, upload-time = "2025-05-18T19:03:39.577Z" },
    { url = "https://files.pythonhosted.org/packages/73/6d/29b7c2dc76ce93cbedabfd842fc9096d01a0550c52692dfc33d3cc889815/jiter-0.10.0-cp311-cp311-win32.whl", hash = "sha256:db16e4848b7e826edca4ccdd5b145939758dadf0dc06e7007ad0e9cfb5928ae7", size = 210765, upload-time = "2025-05-18T19:03:41.271Z" },
    { url = "https://files.pythonhosted.org/packages/c2/c9/d394706deb4c660137caf13e33d05a031d734eb99c051142e039d8ceb794/jiter-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c9c1d5f10e18909e993f9641f12fe1c77b3e9b533ee94ffa970acc14ded3812", size = 209234, upload-time = "2025-05-18T19:03:42.918Z" },
    { url = "https://files.pythonhosted.org/packages/6d/b5/348b3313c58f5fbfb2194eb4d07e46a35748ba6e5b3b3046143f3040bafa/jiter-0.10.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1e274728e4a5345a6dde2d343c8da018b9d4bd4350f5a472fa91f66fda44911b", size = 312262, upload-time = "2025-05-18T19:03:44.637Z" },
    { url = "https://files.pythonhosted.org/packages/9c/4a/6a2397096162b21645162825f058d1709a02965606e537e3304b02742e9b/jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7202ae396446c988cb2a5feb33a543ab2165b786ac97f53b59aafb803fef0744", size = 320124, upload-time = "2025-05-18T19:03:46.341Z" },
    { url = "https://files.pythonhosted.org/packages/2a/85/1ce02cade7516b726dd88f59a4ee46914bf79d1676d1228ef2002ed2f1c9/jiter-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23ba7722d6748b6920ed02a8f1726fb4b33e0fd2f3f621816a8b486c66410ab2", size = 345330, upload-time = "2025-05-18T19:03:47.596Z" },
    { url = "https://files.pythonhosted.org/packages/75/d0/bb6b4f209a77190ce10ea8d7e50bf3725fc16d3372d0a9f11985a2b23eff/jiter-0.10.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:371eab43c0a288537d30e1f0b193bc4eca90439fc08a022dd83e5e07500ed026", size = 369670, upload-time = "2025-05-18T19:03:49.334Z" },
    { url = "https://files.pythonhosted.org/packages/a0/f5/a61787da9b8847a601e6827fbc42ecb12be2c925ced3252c8ffcb56afcaf/jiter-0.10.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c675736059020365cebc845a820214765162728b51ab1e03a1b7b3abb70f74c", size = 489057, upload-time = "2025-05-18T19:03:50.66Z" },
    { url = "https://files.pythonhosted.org/packages/12/e4/6f906272810a7b21406c760a53aadbe52e99ee070fc5c0cb191e316de30b/jiter-0.10.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c5867d40ab716e4684858e4887489685968a47e3ba222e44cde6e4a2154f959", size = 389372, upload-time = "2025-05-18T19:03:51.98Z" },
    { url = "https://files.pythonhosted.org/packages/e2/ba/77013b0b8ba904bf3762f11e0129b8928bff7f978a81838dfcc958ad5728/jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395bb9a26111b60141757d874d27fdea01b17e8fac958b91c20128ba8f4acc8a", size = 352038, upload-time = "2025-05-18T19:03:53.703Z" },
    { url = "https://files.pythonhosted.org/packages/67/27/c62568e3ccb03368dbcc44a1ef3a423cb86778a4389e995125d3d1aaa0a4/jiter-0.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6842184aed5cdb07e0c7e20e5bdcfafe33515ee1741a6835353bb45fe5d1bd95", size = 391538, upload-time = "2025-05-18T19:03:55.046Z" },
    { url = "https://files.pythonhosted.org/packages/c0/72/0d6b7e31fc17a8fdce76164884edef0698ba556b8eb0af9546ae1a06b91d/jiter-0.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:62755d1bcea9876770d4df713d82606c8c1a3dca88ff39046b85a048566d56ea", size = 523557, upload-time = "2025-05-18T19:03:56.386Z" },
    { url = "https://files.pythonhosted.org/packages/2f/09/bc1661fbbcbeb6244bd2904ff3a06f340aa77a2b94e5a7373fd165960ea3/jiter-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:533efbce2cacec78d5ba73a41756beff8431dfa1694b6346ce7af3a12c42202b", size = 514202, upload-time = "2025-05-18T19:03:57.675Z" },
    { url = "https://files.pythonhosted.org/packages/1b/84/5a5d5400e9d4d54b8004c9673bbe4403928a00d28529ff35b19e9d176b19/jiter-0.10.0-cp312-cp312-win32.whl", hash = "sha256:8be921f0cadd245e981b964dfbcd6fd4bc4e254cdc069490416dd7a2632ecc01", size = 211781, upload-time = "2025-05-18T19:03:59.025Z" },
    { url = "https://files.pythonhosted.org/packages/9b/52/7ec47455e26f2d6e5f2ea4951a0652c06e5b995c291f723973ae9e724a65/jiter-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:a7c7d785ae9dda68c2678532a5a1581347e9c15362ae9f6e68f3fdbfb64f2e49", size = 206176, upload-time = "2025-05-18T19:04:00.305Z" },
    { url = "https://files.pythonhosted.org/packages/2e/b0/279597e7a270e8d22623fea6c5d4eeac328e7d95c236ed51a2b884c54f70/jiter-0.10.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0588107ec8e11b6f5ef0e0d656fb2803ac6cf94a96b2b9fc675c0e3ab5e8644", size = 311617, upload-time = "2025-05-18T19:04:02.078Z" },
    { url = "https://files.pythonhosted.org/packages/91/e3/0916334936f356d605f54cc164af4060e3e7094364add445a3bc79335d46/jiter-0.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cafc4628b616dc32530c20ee53d71589816cf385dd9449633e910d596b1f5c8a", size = 318947, upload-time = "2025-05-18T19:04:03.347Z" },
    { url = "https://files.pythonhosted.org/packages/6a/8e/fd94e8c02d0e94539b7d669a7ebbd2776e51f329bb2c84d4385e8063a2ad/jiter-0.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:520ef6d981172693786a49ff5b09eda72a42e539f14788124a07530f785c3ad6", size = 344618, upload-time = "2025-05-18T19:04:04.709Z" },
    { url = "https://files.pythonhosted.org/packages/6f/b0/f9f0a2ec42c6e9c2e61c327824687f1e2415b767e1089c1d9135f43816bd/jiter-0.10.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:554dedfd05937f8fc45d17ebdf298fe7e0c77458232bcb73d9fbbf4c6455f5b3", size = 368829, upload-time = "2025-05-18T19:04:06.912Z" },
    { url = "https://files.pythonhosted.org/packages/e8/57/5bbcd5331910595ad53b9fd0c610392ac68692176f05ae48d6ce5c852967/jiter-0.10.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bc299da7789deacf95f64052d97f75c16d4fc8c4c214a22bf8d859a4288a1c2", size = 491034, upload-time = "2025-05-18T19:04:08.222Z" },
    { url = "https://files.pythonhosted.org/packages/9b/be/c393df00e6e6e9e623a73551774449f2f23b6ec6a502a3297aeeece2c65a/jiter-0.10.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5161e201172de298a8a1baad95eb85db4fb90e902353b1f6a41d64ea64644e25", size = 388529, upload-time = "2025-05-18T19:04:09.566Z" },
    { url = "https://files.pythonhosted.org/packages/42/3e/df2235c54d365434c7f150b986a6e35f41ebdc2f95acea3036d99613025d/jiter-0.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e2227db6ba93cb3e2bf67c87e594adde0609f146344e8207e8730364db27041", size = 350671, upload-time = "2025-05-18T19:04:10.98Z" },
    { url = "https://files.pythonhosted.org/packages/c6/77/71b0b24cbcc28f55ab4dbfe029f9a5b73aeadaba677843fc6dc9ed2b1d0a/jiter-0.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:15acb267ea5e2c64515574b06a8bf393fbfee6a50eb1673614aa45f4613c0cca", size = 390864, upload-time = "2025-05-18T19:04:12.722Z" },
    { url = "https://files.pythonhosted.org/packages/6a/d3/ef774b6969b9b6178e1d1e7a89a3bd37d241f3d3ec5f8deb37bbd203714a/jiter-0.10.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:901b92f2e2947dc6dfcb52fd624453862e16665ea909a08398dde19c0731b7f4", size = 522989, upload-time = "2025-05-18T19:04:14.261Z" },
    { url = "https://files.pythonhosted.org/packages/0c/41/9becdb1d8dd5d854142f45a9d71949ed7e87a8e312b0bede2de849388cb9/jiter-0.10.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d0cb9a125d5a3ec971a094a845eadde2db0de85b33c9f13eb94a0c63d463879e", size = 513495, upload-time = "2025-05-18T19:04:15.603Z" },
    { url = "https://files.pythonhosted.org/packages/9c/36/3468e5a18238bdedae7c4d19461265b5e9b8e288d3f86cd89d00cbb48686/jiter-0.10.0-cp313-cp313-win32.whl", hash = "sha256:48a403277ad1ee208fb930bdf91745e4d2d6e47253eedc96e2559d1e6527006d", size = 211289, upload-time = "2025-05-18T19:04:17.541Z" },
    { url = "https://files.pythonhosted.org/packages/7e/07/1c96b623128bcb913706e294adb5f768fb7baf8db5e1338ce7b4ee8c78ef/jiter-0.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:75f9eb72ecb640619c29bf714e78c9c46c9c4eaafd644bf78577ede459f330d4", size = 205074, upload-time = "2025-05-18T19:04:19.21Z" },
    { url = "https://files.pythonhosted.org/packages/54/46/caa2c1342655f57d8f0f2519774c6d67132205909c65e9aa8255e1d7b4f4/jiter-0.10.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:28ed2a4c05a1f32ef0e1d24c2611330219fed727dae01789f4a335617634b1ca", size = 318225, upload-time = "2025-05-18T19:04:20.583Z" },
    { url = "https://files.pythonhosted.org/packages/43/84/c7d44c75767e18946219ba2d703a5a32ab37b0bc21886a97bc6062e4da42/jiter-0.10.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a4c418b1ec86a195f1ca69da8b23e8926c752b685af665ce30777233dfe070", size = 350235, upload-time = "2025-05-18T19:04:22.363Z" },
    { url = "https://files.pythonhosted.org/packages/01/16/f5a0135ccd968b480daad0e6ab34b0c7c5ba3bc447e5088152696140dcb3/jiter-0.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d7bfed2fe1fe0e4dda6ef682cee888ba444b21e7a6553e03252e4feb6cf0adca", size = 207278, upload-time = "2025-05-18T19:04:23.627Z" },
    { url = "https://files.pythonhosted.org/packages/1c/9b/1d646da42c3de6c2188fdaa15bce8ecb22b635904fc68be025e21249ba44/jiter-0.10.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:5e9251a5e83fab8d87799d3e1a46cb4b7f2919b895c6f4483629ed2446f66522", size = 310866, upload-time = "2025-05-18T19:04:24.891Z" },
    { url = "https://files.pythonhosted.org/packages/ad/0e/26538b158e8a7c7987e94e7aeb2999e2e82b1f9d2e1f6e9874ddf71ebda0/jiter-0.10.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:023aa0204126fe5b87ccbcd75c8a0d0261b9abdbbf46d55e7ae9f8e22424eeb8", size = 318772, upload-time = "2025-05-18T19:04:26.161Z" },
    { url = "https://files.pythonhosted.org/packages/7b/fb/d302893151caa1c2636d6574d213e4b34e31fd077af6050a9c5cbb42f6fb/jiter-0.10.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c189c4f1779c05f75fc17c0c1267594ed918996a231593a21a5ca5438445216", size = 344534, upload-time = "2025-05-18T19:04:27.495Z" },
    { url = "https://files.pythonhosted.org/packages/01/d8/5780b64a149d74e347c5128d82176eb1e3241b1391ac07935693466d6219/jiter-0.10.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:15720084d90d1098ca0229352607cd68256c76991f6b374af96f36920eae13c4", size = 369087, upload-time = "2025-05-18T19:04:28.896Z" },
    { url = "https://files.pythonhosted.org/packages/e8/5b/f235a1437445160e777544f3ade57544daf96ba7e96c1a5b24a6f7ac7004/jiter-0.10.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4f2fb68e5f1cfee30e2b2a09549a00683e0fde4c6a2ab88c94072fc33cb7426", size = 490694, upload-time = "2025-05-18T19:04:30.183Z" },
    { url = "https://files.pythonhosted.org/packages/85/a9/9c3d4617caa2ff89cf61b41e83820c27ebb3f7b5fae8a72901e8cd6ff9be/jiter-0.10.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce541693355fc6da424c08b7edf39a2895f58d6ea17d92cc2b168d20907dee12", size = 388992, upload-time = "2025-05-18T19:04:32.028Z" },
    { url = "https://files.pythonhosted.org/packages/68/b1/344fd14049ba5c94526540af7eb661871f9c54d5f5601ff41a959b9a0bbd/jiter-0.10.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c50c40272e189d50006ad5c73883caabb73d4e9748a688b216e85a9a9ca3b9", size = 351723, upload-time = "2025-05-18T19:04:33.467Z" },
    { url = "https://files.pythonhosted.org/packages/41/89/4c0e345041186f82a31aee7b9d4219a910df672b9fef26f129f0cda07a29/jiter-0.10.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fa3402a2ff9815960e0372a47b75c76979d74402448509ccd49a275fa983ef8a", size = 392215, upload-time = "2025-05-18T19:04:34.827Z" },
    { url = "https://files.pythonhosted.org/packages/55/58/ee607863e18d3f895feb802154a2177d7e823a7103f000df182e0f718b38/jiter-0.10.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:1956f934dca32d7bb647ea21d06d93ca40868b505c228556d3373cbd255ce853", size = 522762, upload-time = "2025-05-18T19:04:36.19Z" },
    { url = "https://files.pythonhosted.org/packages/15/d0/9123fb41825490d16929e73c212de9a42913d68324a8ce3c8476cae7ac9d/jiter-0.10.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:fcedb049bdfc555e261d6f65a6abe1d5ad68825b7202ccb9692636c70fcced86", size = 513427, upload-time = "2025-05-18T19:04:37.544Z" },
    { url = "https://files.pythonhosted.org/packages/d8/b3/2bd02071c5a2430d0b70403a34411fc519c2f227da7b03da9ba6a956f931/jiter-0.10.0-cp314-cp314-win32.whl", hash = "sha256:ac509f7eccca54b2a29daeb516fb95b6f0bd0d0d8084efaf8ed5dfc7b9f0b357", size = 210127, upload-time = "2025-05-18T19:04:38.837Z" },
    { url = "https://files.pythonhosted.org/packages/03/0c/5fe86614ea050c3ecd728ab4035534387cd41e7c1855ef6c031f1ca93e3f/jiter-0.10.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ed975b83a2b8639356151cef5c0d597c68376fc4922b45d0eb384ac058cfa00", size = 318527, upload-time = "2025-05-18T19:04:40.612Z" },
    { url = "https://files.pythonhosted.org/packages/b3/4a/4175a563579e884192ba6e81725fc0448b042024419be8d83aa8a80a3f44/jiter-0.10.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa96f2abba33dc77f79b4cf791840230375f9534e5fac927ccceb58c5e604a5", size = 354213, upload-time = "2025-05-18T19:04:41.894Z" },
]

[[package]]
name = "llmscraper"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
    { name = "anthropic" },
    { name = "apify-client" },
]

[package.metadata]
requires-dist = [
    { name = "anthropic", specifier = ">=0.54.0" },
    { name = "apify-client", specifier = ">=1.11.0" },
]

[[package]]
name = "more-itertools"
version = "10.7.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ce/a0/834b0cebabbfc7e311f30b46c8188790a37f89fc8d756660346fe5abfd09/more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3", size = 127671, upload-time = "2025-04-22T14:17:41.838Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/2b/9f/7ba6f94fc1e9ac3d2b853fdff3035fb2fa5afbed898c4a72b8a020610594/more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e", size = 65278, upload-time = "2025-04-22T14:17:40.49Z" },
]

[[package]]
name = "pydantic"
version = "2.11.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "annotated-types" },
    { name = "pydantic-core" },
    { name = "typing-extensions" },
    { name = "typing-inspection" },
]
sdist = { url = "https://files.pythonhosted.org/packages/00/dd/4325abf92c39ba8623b5af936ddb36ffcfe0beae70405d456ab1fb2f5b8c/pydantic-2.11.7.tar.gz", hash = "sha256:d989c3c6cb79469287b1569f7447a17848c998458d49ebe294e975b9baf0f0db", size = 788350, upload-time = "2025-06-14T08:33:17.137Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl", hash = "sha256:dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b", size = 444782, upload-time = "2025-06-14T08:33:14.905Z" },
]

[[package]]
name = "pydantic-core"
version = "2.33.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" },
    { url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" },
    { url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" },
    { url = "https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" },
    { url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" },
    { url = "https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" },
    { url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" },
    { url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" },
    { url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" },
    { url = "https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" },
    { url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" },
    { url = "https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" },
    { url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" },
    { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" },
    { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" },
    { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" },
    { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" },
    { url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" },
    { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" },
    { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = "2025-04-23T18:31:13.536Z" },
    { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" },
    { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" },
    { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" },
    { url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" },
    { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" },
    { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" },
    { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" },
    { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" },
    { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" },
    { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" },
    { url = "https://files.pythonhosted.org/packages/ec/6b/1ec2c03837ac00886ba8160ce041ce4e325b41d06a034adbef11339ae422/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb8c529b2819c37140eb51b914153063d27ed88e3bdc31b71198a198e921e011", size = 1964199, upload-time = "2025-04-23T18:31:31.025Z" },
    { url = "https://files.pythonhosted.org/packages/2d/1d/6bf34d6adb9debd9136bd197ca72642203ce9aaaa85cfcbfcf20f9696e83/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c52b02ad8b4e2cf14ca7b3d918f3eb0ee91e63b3167c32591e57c4317e134f8f", size = 2120296, upload-time = "2025-04-23T18:31:32.514Z" },
    { url = "https://files.pythonhosted.org/packages/e0/94/2bd0aaf5a591e974b32a9f7123f16637776c304471a0ab33cf263cf5591a/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96081f1605125ba0855dfda83f6f3df5ec90c61195421ba72223de35ccfb2f88", size = 2676109, upload-time = "2025-04-23T18:31:33.958Z" },
    { url = "https://files.pythonhosted.org/packages/f9/41/4b043778cf9c4285d59742281a769eac371b9e47e35f98ad321349cc5d61/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f57a69461af2a5fa6e6bbd7a5f60d3b7e6cebb687f55106933188e79ad155c1", size = 2002028, upload-time = "2025-04-23T18:31:39.095Z" },
    { url = "https://files.pythonhosted.org/packages/cb/d5/7bb781bf2748ce3d03af04d5c969fa1308880e1dca35a9bd94e1a96a922e/pydantic_core-2.33.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:572c7e6c8bb4774d2ac88929e3d1f12bc45714ae5ee6d9a788a9fb35e60bb04b", size = 2100044, upload-time = "2025-04-23T18:31:41.034Z" },
    { url = "https://files.pythonhosted.org/packages/fe/36/def5e53e1eb0ad896785702a5bbfd25eed546cdcf4087ad285021a90ed53/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:db4b41f9bd95fbe5acd76d89920336ba96f03e149097365afe1cb092fceb89a1", size = 2058881, upload-time = "2025-04-23T18:31:42.757Z" },
    { url = "https://files.pythonhosted.org/packages/01/6c/57f8d70b2ee57fc3dc8b9610315949837fa8c11d86927b9bb044f8705419/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:fa854f5cf7e33842a892e5c73f45327760bc7bc516339fda888c75ae60edaeb6", size = 2227034, upload-time = "2025-04-23T18:31:44.304Z" },
    { url = "https://files.pythonhosted.org/packages/27/b9/9c17f0396a82b3d5cbea4c24d742083422639e7bb1d5bf600e12cb176a13/pydantic_core-2.33.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5f483cfb75ff703095c59e365360cb73e00185e01aaea067cd19acffd2ab20ea", size = 2234187, upload-time = "2025-04-23T18:31:45.891Z" },
    { url = "https://files.pythonhosted.org/packages/b0/6a/adf5734ffd52bf86d865093ad70b2ce543415e0e356f6cacabbc0d9ad910/pydantic_core-2.33.2-cp312-cp312-win32.whl", hash = "sha256:9cb1da0f5a471435a7bc7e439b8a728e8b61e59784b2af70d7c169f8dd8ae290", size = 1892628, upload-time = "2025-04-23T18:31:47.819Z" },
    { url = "https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl", hash = "sha256:f941635f2a3d96b2973e867144fde513665c87f13fe0e193c158ac51bfaaa7b2", size = 1955866, upload-time = "2025-04-23T18:31:49.635Z" },
    { url = "https://files.pythonhosted.org/packages/0d/24/8b11e8b3e2be9dd82df4b11408a67c61bb4dc4f8e11b5b0fc888b38118b5/pydantic_core-2.33.2-cp312-cp312-win_arm64.whl", hash = "sha256:cca3868ddfaccfbc4bfb1d608e2ccaaebe0ae628e1416aeb9c4d88c001bb45ab", size = 1888894, upload-time = "2025-04-23T18:31:51.609Z" },
    { url = "https://files.pythonhosted.org/packages/46/8c/99040727b41f56616573a28771b1bfa08a3d3fe74d3d513f01251f79f172/pydantic_core-2.33.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1082dd3e2d7109ad8b7da48e1d4710c8d06c253cbc4a27c1cff4fbcaa97a9e3f", size = 2015688, upload-time = "2025-04-23T18:31:53.175Z" },
    { url = "https://files.pythonhosted.org/packages/3a/cc/5999d1eb705a6cefc31f0b4a90e9f7fc400539b1a1030529700cc1b51838/pydantic_core-2.33.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f517ca031dfc037a9c07e748cefd8d96235088b83b4f4ba8939105d20fa1dcd6", size = 1844808, upload-time = "2025-04-23T18:31:54.79Z" },
    { url = "https://files.pythonhosted.org/packages/6f/5e/a0a7b8885c98889a18b6e376f344da1ef323d270b44edf8174d6bce4d622/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a9f2c9dd19656823cb8250b0724ee9c60a82f3cdf68a080979d13092a3b0fef", size = 1885580, upload-time = "2025-04-23T18:31:57.393Z" },
    { url = "https://files.pythonhosted.org/packages/3b/2a/953581f343c7d11a304581156618c3f592435523dd9d79865903272c256a/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b0a451c263b01acebe51895bfb0e1cc842a5c666efe06cdf13846c7418caa9a", size = 1973859, upload-time = "2025-04-23T18:31:59.065Z" },
    { url = "https://files.pythonhosted.org/packages/e6/55/f1a813904771c03a3f97f676c62cca0c0a4138654107c1b61f19c644868b/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ea40a64d23faa25e62a70ad163571c0b342b8bf66d5fa612ac0dec4f069d916", size = 2120810, upload-time = "2025-04-23T18:32:00.78Z" },
    { url = "https://files.pythonhosted.org/packages/aa/c3/053389835a996e18853ba107a63caae0b9deb4a276c6b472931ea9ae6e48/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb2d542b4d66f9470e8065c5469ec676978d625a8b7a363f07d9a501a9cb36a", size = 2676498, upload-time = "2025-04-23T18:32:02.418Z" },
    { url = "https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d", size = 2000611, upload-time = "2025-04-23T18:32:04.152Z" },
    { url = "https://files.pythonhosted.org/packages/59/a7/63ef2fed1837d1121a894d0ce88439fe3e3b3e48c7543b2a4479eb99c2bd/pydantic_core-2.33.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04a1a413977ab517154eebb2d326da71638271477d6ad87a769102f7c2488c56", size = 2107924, upload-time = "2025-04-23T18:32:06.129Z" },
    { url = "https://files.pythonhosted.org/packages/04/8f/2551964ef045669801675f1cfc3b0d74147f4901c3ffa42be2ddb1f0efc4/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c8e7af2f4e0194c22b5b37205bfb293d166a7344a5b0d0eaccebc376546d77d5", size = 2063196, upload-time = "2025-04-23T18:32:08.178Z" },
    { url = "https://files.pythonhosted.org/packages/26/bd/d9602777e77fc6dbb0c7db9ad356e9a985825547dce5ad1d30ee04903918/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:5c92edd15cd58b3c2d34873597a1e20f13094f59cf88068adb18947df5455b4e", size = 2236389, upload-time = "2025-04-23T18:32:10.242Z" },
    { url = "https://files.pythonhosted.org/packages/42/db/0e950daa7e2230423ab342ae918a794964b053bec24ba8af013fc7c94846/pydantic_core-2.33.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:65132b7b4a1c0beded5e057324b7e16e10910c106d43675d9bd87d4f38dde162", size = 2239223, upload-time = "2025-04-23T18:32:12.382Z" },
    { url = "https://files.pythonhosted.org/packages/58/4d/4f937099c545a8a17eb52cb67fe0447fd9a373b348ccfa9a87f141eeb00f/pydantic_core-2.33.2-cp313-cp313-win32.whl", hash = "sha256:52fb90784e0a242bb96ec53f42196a17278855b0f31ac7c3cc6f5c1ec4811849", size = 1900473, upload-time = "2025-04-23T18:32:14.034Z" },
    { url = "https://files.pythonhosted.org/packages/a0/75/4a0a9bac998d78d889def5e4ef2b065acba8cae8c93696906c3a91f310ca/pydantic_core-2.33.2-cp313-cp313-win_amd64.whl", hash = "sha256:c083a3bdd5a93dfe480f1125926afcdbf2917ae714bdb80b36d34318b2bec5d9", size = 1955269, upload-time = "2025-04-23T18:32:15.783Z" },
    { url = "https://files.pythonhosted.org/packages/f9/86/1beda0576969592f1497b4ce8e7bc8cbdf614c352426271b1b10d5f0aa64/pydantic_core-2.33.2-cp313-cp313-win_arm64.whl", hash = "sha256:e80b087132752f6b3d714f041ccf74403799d3b23a72722ea2e6ba2e892555b9", size = 1893921, upload-time = "2025-04-23T18:32:18.473Z" },
    { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" },
    { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" },
    { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" },
    { url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" },
    { url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" },
    { url = "https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" },
    { url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" },
    { url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" },
    { url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" },
    { url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = "2025-04-23T18:33:08.44Z" },
    { url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" },
    { url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" },
    { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" },
    { url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" },
    { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" },
    { url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" },
    { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" },
    { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" },
    { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" },
    { url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" },
    { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },
]

[[package]]
name = "sniffio"
version = "1.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
]

[[package]]
name = "typing-extensions"
version = "4.14.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d1/bc/51647cd02527e87d05cb083ccc402f93e441606ff1f01739a62c8ad09ba5/typing_extensions-4.14.0.tar.gz", hash = "sha256:8676b788e32f02ab42d9e7c61324048ae4c6d844a399eebace3d4979d75ceef4", size = 107423, upload-time = "2025-06-02T14:52:11.399Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/69/e0/552843e0d356fbb5256d21449fa957fa4eff3bbc135a74a691ee70c7c5da/typing_extensions-4.14.0-py3-none-any.whl", hash = "sha256:a1514509136dd0b477638fc68d6a91497af5076466ad0fa6c338e44e359944af", size = 43839, upload-time = "2025-06-02T14:52:10.026Z" },
]

[[package]]
name = "typing-inspection"
version = "0.4.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f8/b1/0c11f5058406b3af7609f121aaa6b609744687f1d158b3c3a5bf4cc94238/typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28", size = 75726, upload-time = "2025-05-21T18:55:23.885Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51", size = 14552, upload-time = "2025-05-21T18:55:22.152Z" },
]

.actor/README.md

1# ScraperCodeGenerator Actor
2
3An intelligent web scraping actor that uses AI-powered quality evaluation and multiple scraping strategies to extract data from websites.
4
5## Features
6
7- **AI-Powered Quality Evaluation**: Uses Claude AI to evaluate the quality of extracted data
8- **Multiple Scraping Strategies**: Employs different scraping techniques for optimal results
9- **Intelligent Pipeline**: Automatically selects the best scraping approach based on the website structure
10
11## Input
12
13The actor expects the following input parameters:
14
15- **Target URL** (required): The URL of the website to scrape
16- **User Goal** (required): Description of what data you want to extract (e.g., "Extract product names, prices, and descriptions")
17- **Claude API Key** (required): Your Anthropic Claude API key for AI-powered evaluation
18
19## Output
20
21The actor returns extracted data in a structured format based on your specified goal. Results are automatically saved to the default dataset.
22
23## Usage
24
251. Provide the target URL you want to scrape
262. Describe what data you want to extract in the "User Goal" field
273. Enter your Claude API key
284. Run the actor
29
30The actor will intelligently analyze the website, extract the requested data, and evaluate the quality of the results using AI.
31
32## Example Input
33
34```json
35{
36    "targetUrl": "https://example-store.com/products",
37    "userGoal": "Extract product names, prices, and availability status",
38    "claudeApiKey": "sk-ant-..."
39}
40```
41
42## Requirements
43
44- Valid Claude API key from Anthropic
45- Target website must be accessible
46- Sufficient Apify compute units for the scraping operation

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "ScraperCodeGenerator",
    "version": "0.1",
    "buildTag": "latest",
    "environmentVariables": {
        "CLAUDE_API_KEY": "@claudeApiKey"
    },
    "dockerfile": "./Dockerfile",
    "input": {
        "title": "Intelligent Scraper Settings",
        "description": "AI-powered web scraper that generates custom scraping code based on your requirements",
        "type": "object",
        "schemaVersion": 1,
        "properties": {
            "targetUrl": {
                "title": "Target URL",
                "description": "The URL of the website you want to scrape",
                "type": "string",
                "editor": "textfield",
                "prefill": "https://books.toscrape.com/"
            },
            "userGoal": {
                "title": "Scraping Goal",
                "description": "Describe what data you want to extract from the website",
                "type": "string",
                "editor": "textarea",
                "prefill": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock."
            },
            "claudeApiKey": {
                "title": "Claude API Key",
                "description": "Your Anthropic Claude API key for AI-powered code generation",
                "type": "string",
                "editor": "textfield",
                "isSecret": true
            },
            "maxRetries": {
                "title": "Max Retries",
                "description": "Maximum number of retry attempts for scraping",
                "type": "integer",
                "minimum": 1,
                "maximum": 10,
                "default": 3
            },
            "timeout": {
                "title": "Timeout (seconds)",
                "description": "Timeout for each scraping attempt in seconds",
                "type": "integer",
                "minimum": 10,
                "maximum": 300,
                "default": 60
            }
        },
        "required": ["targetUrl", "userGoal", "claudeApiKey"]
    },
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "views": {
                "scraped_data": {
                    "title": "Scraped Data",
                    "transformation": {},
                    "display": {
                        "component": "table",
                        "properties": {
                            "url": {
                                "label": "Source URL",
                                "format": "link"
                            },
                            "title": {
                                "label": "Page Title",
                                "format": "text"
                            },
                            "data": {
                                "label": "Extracted Data",
                                "format": "object"
                            },
                            "timestamp": {
                                "label": "Scraped At",
                                "format": "datetime"
                            },
                            "success": {
                                "label": "Success",
                                "format": "boolean"
                            },
                            "error": {
                                "label": "Error Message",
                                "format": "text"
                            }
                        }
                    }
                }
            }
        }
    }
}

quality_eval/claude_client.py

quality_eval/html_quality_evaluator.py

1"""
2HTML Quality Evaluator Module
3
4This module provides functionality to evaluate the quality of HTML documents 
5fetched by automated browsers. It determines if the HTML is suitable for data 
6extraction or if it represents a failure page (e.g., CAPTCHA, block page, etc.).
7
8Uses Claude AI to perform intelligent analysis based on the user's extraction goals.
9"""
10
11import json
12import logging
13import re
14from typing import Dict, Any, Optional
15from dataclasses import dataclass
16
17import anthropic
18
19
20@dataclass
21class EvaluationResult:
22    """Result of HTML quality evaluation."""
23    score: int  # 1-10 scale
24    reasoning: str
25
26
27@dataclass
28class PreEvaluationResult:
29    """Result of pre-evaluation checks before sending to Claude."""
30    is_valid_html: bool
31    score: Optional[int] = None  # If we can determine score without Claude
32    reasoning: Optional[str] = None
33    should_continue_to_claude: bool = True
34
35
36class HTMLQualityEvaluator:
37    """
38    Evaluates HTML quality for web scraping using Claude AI.
39    
40    This class provides methods to analyze HTML content and determine
41    if it's suitable for data extraction based on the user's goals.
42    """
43    
44    def __init__(self, claude_api_key: str):
45        """
46        Initialize the HTML Quality Evaluator.
47        
48        Args:
49            claude_api_key: Anthropic API key for Claude access
50        """
51        if not claude_api_key or not claude_api_key.strip():
52            raise ValueError("claude_api_key cannot be empty")
53            
54        self.client = anthropic.Anthropic(api_key=claude_api_key)
55        self.logger = logging.getLogger(__name__)
56    
57    def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:
58        """
59        Perform basic validation checks on HTML content before sending to Claude.
60        
61        This method checks for obvious issues that don't require AI analysis:
62        - Whether content is actually HTML
63        - Empty or whitespace-only content
64        
65        Args:
66            html_content: The HTML content to pre-evaluate
67            
68        Returns:
69            PreEvaluationResult indicating if validation passed and whether to continue
70        """
71        if not html_content or not html_content.strip():
72            return PreEvaluationResult(
73                is_valid_html=False,
74                score=1,
75                reasoning="Content is empty or contains only whitespace",
76                should_continue_to_claude=False
77            )
78        
79        content = html_content.strip()
80        content_lower = content.lower()
81        
82        # Check if content looks like HTML at all
83        has_opening_closing_tags = '<' in content and '>' in content
84        
85        if not has_opening_closing_tags:
86            return PreEvaluationResult(
87                is_valid_html=False,
88                score=1,
89                reasoning="Content does not contain HTML tags",
90                should_continue_to_claude=False
91            )
92        
93        # Check for obvious non-HTML content
94        if content.startswith('{') and content.endswith('}'):
95            # Looks like JSON
96            return PreEvaluationResult(
97                is_valid_html=False,
98                score=1,
99                reasoning="Content appears to be JSON, not HTML",
100                should_continue_to_claude=False
101            )
102        
103        if content.startswith('<?xml'):
104            # XML but not HTML
105            return PreEvaluationResult(
106                is_valid_html=False,
107                score=1,
108                reasoning="Content appears to be XML, not HTML",
109                should_continue_to_claude=False
110            )
111        
112        # If it looks like valid HTML, let Claude evaluate it
113        return PreEvaluationResult(
114            is_valid_html=True,
115            should_continue_to_claude=True
116        )
117    
118    def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:
119        """
120        Create the prompt for Claude to evaluate HTML quality.
121        
122        Args:
123            user_goal: The user's data extraction goal
124            html_content: The HTML content to evaluate
125            
126        Returns:
127            Formatted prompt for Claude
128        """
129        return f"""You are a QA automation engineer specializing in web scraping. Your task is to evaluate the quality of an HTML document that was fetched by an automated browser. The goal is to determine if the HTML is suitable for data extraction or if it looks like a failure page (e.g., a CAPTCHA, block page, or empty loading page).
130
131You will be given the user's original data extraction goal and the full HTML content from a scraping attempt.
132
133REQUIREMENTS:
134- Analysis: Analyze the provided HTML for common indicators of success or failure.
135- Success Indicators: Meaningful content in the <body>, presence of <h1>-<h6> tags, lists (<ul>, <ol>), tables (<table>), structured <div>s with descriptive class names.
136- Failure Indicators: Common keywords like "CAPTCHA", "bot detection", "enable JavaScript", "access denied", "rate limit". An empty or near-empty <body> tag. A structure that looks like a loading spinner.
137- Output Format: Your response MUST be a single, well-formed JSON object with the following two keys:
138  "score": An integer from 1 (unusable) to 10 (perfectly rendered and content-rich).
139  "reasoning": A brief, one-sentence explanation for your score.
140- Contextual Awareness: Use the "User's Goal" to inform your analysis. If the user wants "product listings" and you see a grid of items, that's a high score. If you see a login form, that's a low score.
141
142[INPUT 1] User's Goal:
143{user_goal}
144
145[INPUT 2] HTML Content from an Actor Run:
146{html_content}
147
148Please analyze the HTML and provide your evaluation as a JSON object."""
149
150    def _parse_claude_response(self, response_text: str) -> Optional[EvaluationResult]:
151        """
152        Parse Claude's response and extract the evaluation result.
153        
154        Args:
155            response_text: Raw response from Claude
156            
157        Returns:
158            EvaluationResult if parsing successful, None otherwise
159        """
160        try:
161            # Try to find JSON in the response
162            json_match = re.search(r'\{[^}]*"score"[^}]*\}', response_text, re.DOTALL)
163            if not json_match:
164                self.logger.error("No JSON object found in Claude's response")
165                return None
166            
167            json_str = json_match.group(0)
168            result_dict = json.loads(json_str)
169            
170            # Validate required fields
171            if 'score' not in result_dict or 'reasoning' not in result_dict:
172                self.logger.error("Missing required fields in Claude's response")
173                return None
174            
175            score = int(result_dict['score'])
176            if not (1 <= score <= 10):
177                self.logger.error(f"Score {score} is outside valid range 1-10")
178                return None
179            
180            return EvaluationResult(
181                score=score,
182                reasoning=result_dict['reasoning']
183            )
184            
185        except (json.JSONDecodeError, ValueError, KeyError) as e:
186            self.logger.error(f"Failed to parse Claude's response: {e}")
187            return None
188    
189    def _preprocess_html(self, html_content: str) -> str:
190        """
191        Preprocess HTML content for analysis.
192        
193        Args:
194            html_content: Raw HTML content
195            
196        Returns:
197            Preprocessed HTML content
198        """
199        # Truncate very long HTML to avoid token limits
200        max_length = 100000  # Adjust based on Claude's token limits
201        if len(html_content) > max_length:
202            self.logger.warning(f"HTML content truncated from {len(html_content)} to {max_length} characters")
203            html_content = html_content[:max_length] + "\n<!-- Content truncated for analysis -->"
204        
205        return html_content
206    
207    def get_pre_evaluation_info(self, html_content: str) -> PreEvaluationResult:
208        """
209        Get detailed pre-evaluation information without performing the full evaluation.
210        
211        This method allows users to see what the pre-evaluation checks detected
212        without calling Claude if they just want to understand the basic validation.
213        
214        Args:
215            html_content: The HTML content to pre-evaluate
216            
217        Returns:
218            PreEvaluationResult with detailed validation information
219        """
220        return self._pre_evaluate_html(html_content)
221    
222    def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
223        """
224        Evaluate the quality of HTML content for data extraction.
225        
226        This is the main function that analyzes HTML content to determine if it's 
227        suitable for data extraction based on the user's goals.
228        
229        The evaluation process consists of two stages:
230        1. Pre-evaluation: Basic validation checks for HTML structure, error pages, 
231           and obvious content issues that don't require AI analysis
232        2. Claude AI analysis: Detailed content evaluation using AI if pre-evaluation passes
233        
234        Args:
235            user_goal: The user's data extraction goal/objective
236            html_content: The HTML content to evaluate
237            
238        Returns:
239            EvaluationResult containing score (1-10) and reasoning, or None if evaluation failed
240            
241        Example:
242            >>> evaluator = HTMLQualityEvaluator("your-claude-api-key")
243            >>> result = evaluator.evaluate_html_quality(
244            ...     "I want to get a list of all articles on the homepage",
245            ...     "<html><body><article>...</article></body></html>"
246            ... )
247            >>> if result:
248            ...     print(f"Score: {result.score}, Reasoning: {result.reasoning}")
249        """
250        if not user_goal or not user_goal.strip():
251            raise ValueError("user_goal cannot be empty")
252        
253        # Perform pre-evaluation checks on the HTML content
254        self.logger.info("Starting pre-evaluation checks on HTML content")
255        pre_eval_result = self._pre_evaluate_html(html_content)
256        
257        if not pre_eval_result.should_continue_to_claude:
258            self.logger.info(f"Pre-evaluation completed without Claude - Score: {pre_eval_result.score}")
259            return EvaluationResult(
260                score=pre_eval_result.score,
261                reasoning=pre_eval_result.reasoning
262            )
263        
264        # Log pre-evaluation results
265        if not pre_eval_result.is_valid_html:
266            self.logger.warning("Pre-evaluation detected non-HTML content, but continuing to Claude")
267        else:
268            self.logger.info("Pre-evaluation passed, proceeding to Claude analysis")
269        
270        try:
271            # Preprocess HTML content
272            processed_html = self._preprocess_html(html_content)
273            
274            # Create the evaluation prompt
275            prompt = self._create_evaluation_prompt(user_goal, processed_html)
276            
277            self.logger.info("Sending HTML evaluation request to Claude")
278            
279            # Send request to Claude
280            response = self.client.messages.create(
281                model="claude-3-5-sonnet-20241022", # TODO use 4
282                max_tokens=1000,
283                messages=[
284                    {
285                        "role": "user",
286                        "content": prompt
287                    }
288                ]
289            )
290            
291            # Extract and parse the response
292            response_text = response.content[0].text
293            result = self._parse_claude_response(response_text)
294            
295            if result:
296                self.logger.info(f"HTML evaluation completed - Score: {result.score}")
297            else:
298                self.logger.error("Failed to parse evaluation result from Claude")
299            
300            return result
301            
302        except anthropic.APIError as e:
303            self.logger.error(f"Claude API error: {e}")
304            return None
305        except Exception as e:
306            self.logger.error(f"Unexpected error during HTML evaluation: {e}", exc_info=True)
307            return None

manual_scraping/apify_runner.py

1"""
2Apify Actor Runner Module
3
4This module provides functionality to run Apify actors and retrieve their results
5using the official Apify Python client library.
6"""
7
8import logging
9from typing import Optional, Dict, Any, List, Union
10
11from apify_client import ApifyClient
12
13
14def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:
15    """
16    Run an Apify actor and retrieve its dataset results.
17    
18    This function is maintained for backward compatibility. For more flexible
19    data retrieval (including key-value stores), use run_apify_actor_with_flexible_retrieval.
20    
21    Args:
22        actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')
23        actor_input: Dictionary containing the input configuration for the actor
24        api_token: Apify API token for authentication (keyword-only argument)
25    
26    Returns:
27        List of dictionaries containing the dataset items if successful, None otherwise.
28        Returns an empty list if the run succeeds but produces no data.
29    
30    Raises:
31        ValueError: If actor_id or api_token is empty
32        
33    Example:
34        >>> input_data = {"startUrls": [{"url": "https://example.com"}]}
35        >>> results = run_apify_actor("apify/web-scraper", input_data, api_token="your_token")
36        >>> if results is not None:
37        ...     print(f"Scraped {len(results)} items")
38    """
39    result = run_apify_actor_with_flexible_retrieval(
40        actor_id=actor_id,
41        actor_input=actor_input,
42        api_token=api_token,
43        retrieve_from="dataset"
44    )
45    
46    # Ensure we return a list for backward compatibility
47    if isinstance(result, list):
48        return result
49    else:
50        return None
51
52
53def run_apify_actor_with_flexible_retrieval(
54    actor_id: str, 
55    actor_input: dict, 
56    *, 
57    api_token: str,
58    retrieve_from: str = "auto"
59) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
60    """
61    Run an Apify actor and retrieve its results from dataset or key-value store.
62    
63    This function starts an Apify actor with the provided input, waits for the run
64    to complete, and returns data from either the dataset or key-value store based
65    on the actor type or explicit configuration.
66    
67    Args:
68        actor_id: The ID or name of the Apify actor to run (e.g., 'apify/web-scraper')
69        actor_input: Dictionary containing the input configuration for the actor
70        api_token: Apify API token for authentication (keyword-only argument)
71        retrieve_from: Where to retrieve data from. Options:
72                      - "auto": Automatically detect based on actor_id
73                      - "dataset": Retrieve from dataset only
74                      - "key-value-store": Retrieve from key-value store only
75                      - "both": Retrieve from both and return combined results
76    
77    Returns:
78        - If retrieve_from is "dataset": List of dictionaries from dataset
79        - If retrieve_from is "key-value-store": Dictionary with key-value store items
80        - If retrieve_from is "both": Dictionary with 'dataset' and 'key_value_store' keys
81        - If retrieve_from is "auto": Appropriate format based on actor type
82        Returns None if the run fails.
83    
84    Raises:
85        ValueError: If actor_id, api_token is empty, or retrieve_from is invalid
86        
87    Example:
88        >>> input_data = {"startUrls": [{"url": "https://example.com"}]}
89        >>> # Auto-detect storage type
90        >>> results = run_apify_actor_with_flexible_retrieval(
91        ...     "apify/website-content-crawler", 
92        ...     input_data, 
93        ...     api_token="your_token"
94        ... )
95        >>> # Explicitly use key-value store
96        >>> results = run_apify_actor_with_flexible_retrieval(
97        ...     "apify/web-scraper", 
98        ...     input_data, 
99        ...     api_token="your_token",
100        ...     retrieve_from="key-value-store"
101        ... )
102    """
103    # Input validation
104    if not actor_id or not actor_id.strip():
105        raise ValueError("actor_id cannot be empty")
106    if not api_token or not api_token.strip():
107        raise ValueError("api_token cannot be empty")
108    if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:
109        raise ValueError("retrieve_from must be 'auto', 'dataset', 'key-value-store', or 'both'")
110    
111    # Configure logging
112    logger = logging.getLogger(__name__)
113    
114    # Determine storage type based on actor if auto mode
115    if retrieve_from == "auto":
116        if "website-content-crawler" in actor_id:
117            retrieve_from = "key-value-store"
118        else:
119            retrieve_from = "dataset"
120    
121    try:
122        # Initialize the Apify client
123        client = ApifyClient(api_token)
124        
125        logger.info(f"Starting Apify actor: {actor_id}")
126        
127        # Start the actor run
128        run = client.actor(actor_id).call(run_input=actor_input)
129        
130        # Check if the run was created successfully
131        if not run:
132            logger.error(f"Failed to start actor run for {actor_id}")
133            return None
134        
135        run_id = run.get('id')
136        status = run.get('status')
137        
138        logger.info(f"Actor run started with ID: {run_id}, Status: {status}")
139        
140        # Check the final status of the run
141        if status != 'SUCCEEDED':
142            logger.warning(
143                f"Actor run {run_id} did not succeed. "
144                f"Final status: {status}. "
145                f"Exit code: {run.get('exitCode', 'N/A')}"
146            )
147            return None
148        
149        logger.info(f"Actor run {run_id} completed successfully")
150        
151        # Retrieve data based on the specified method
152        if retrieve_from == "dataset":
153            dataset_client = client.dataset(run.get('defaultDatasetId'))
154            dataset_items = list(dataset_client.iterate_items())
155            logger.info(f"Retrieved {len(dataset_items)} items from dataset")
156            return dataset_items
157        
158        elif retrieve_from == "key-value-store":
159            kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))
160            
161            # List all keys in the key-value store
162            keys_response = kv_store_client.list_keys()
163            keys = [item['key'] for item in keys_response.get('items', [])]
164            
165            if not keys:
166                logger.warning("No keys found in key-value store")
167                return {}
168            
169            # Retrieve all key-value pairs
170            kv_items = {}
171            for key in keys:
172                try:
173                    value = kv_store_client.get_record(key)
174                    if value:
175                        kv_items[key] = value.get('value')
176                except Exception as e:
177                    logger.warning(f"Failed to retrieve key '{key}': {str(e)}")
178            
179            logger.info(f"Retrieved {len(kv_items)} items from key-value store")
180            return kv_items
181        
182        elif retrieve_from == "both":
183            # Retrieve from both sources
184            results = {"dataset": [], "key_value_store": {}}
185            
186            # Get dataset items
187            try:
188                dataset_client = client.dataset(run.get('defaultDatasetId'))
189                dataset_items = list(dataset_client.iterate_items())
190                results["dataset"] = dataset_items
191                logger.info(f"Retrieved {len(dataset_items)} items from dataset")
192            except Exception as e:
193                logger.warning(f"Failed to retrieve dataset items: {str(e)}")
194            
195            # Get key-value store items
196            try:
197                kv_store_client = client.key_value_store(run.get('defaultKeyValueStoreId'))
198                keys_response = kv_store_client.list_keys()
199                keys = [item['key'] for item in keys_response.get('items', [])]
200                
201                kv_items = {}
202                for key in keys:
203                    try:
204                        value = kv_store_client.get_record(key)
205                        if value:
206                            kv_items[key] = value.get('value')
207                    except Exception as e:
208                        logger.warning(f"Failed to retrieve key '{key}': {str(e)}")
209                
210                results["key_value_store"] = kv_items
211                logger.info(f"Retrieved {len(kv_items)} items from key-value store")
212            except Exception as e:
213                logger.warning(f"Failed to retrieve key-value store items: {str(e)}")
214            
215            return results
216        
217    except Exception as e:
218        logger.error(f"Error running Apify actor {actor_id}: {str(e)}", exc_info=True)
219        return None

manual_scraping/multi_actor_scraper.py

1"""
2Multi-Actor Website Scraper
3
4This module provides functionality to scrape websites using multiple Apify actors
5simultaneously and return a dictionary of actor-scraped HTML pairs.
6"""
7
8import asyncio
9import logging
10from typing import Dict, List, Optional, Any
11from concurrent.futures import ThreadPoolExecutor, as_completed
12import json
13
14from .apify_runner import run_apify_actor_with_flexible_retrieval, run_apify_actor
15
16
17class MultiActorScraper:
18    """
19    A class to scrape websites using multiple Apify actors simultaneously.
20    """
21    
22    def __init__(self, api_token: str):
23        """
24        Initialize the MultiActorScraper.
25        
26        Args:
27            api_token: Apify API token for authentication
28        """
29        self.api_token = api_token
30        self.logger = logging.getLogger(__name__)
31        
32        # Configure logging
33        logging.basicConfig(
34            level=logging.INFO,
35            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
36        )
37    
38    def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:
39        """
40        Get the configuration for each actor with the target URL.
41        
42        Args:
43            target_url: The URL to scrape
44            
45        Returns:
46            Dictionary mapping actor names to their configurations
47        """
48        return {
49            "cheerio-scraper": {
50                "actor_id": "apify/cheerio-scraper",
51                "input": {
52                    "debugLog": False,
53                    "forceResponseEncoding": False,
54                    "ignoreSslErrors": False,
55                    "keepUrlFragments": False,
56                    "pageFunction": """async function pageFunction(context) {
57    const { $, request, log } = context;
58
59    const url = request.url;
60    const html = $.html();  // Get the full HTML of the page
61
62    log.info('Page scraped', { url });
63
64    return {
65        url,
66        html
67    };
68}""",
69                    "postNavigationHooks": """// We need to return array of (possibly async) functions here.
70// The functions accept a single argument: the "crawlingContext" object.
71[
72    async (crawlingContext) => {
73        // ...
74    },
75]""",
76                    "preNavigationHooks": """// We need to return array of (possibly async) functions here.
77// The functions accept two arguments: the "crawlingContext" object
78// and "requestAsBrowserOptions" which are passed to the `requestAsBrowser()`
79// function the crawler calls to navigate..
80[
81    async (crawlingContext, requestAsBrowserOptions) => {
82        // ...
83    }
84]""",
85                    "proxyConfiguration": {
86                        "useApifyProxy": True
87                    },
88                    "respectRobotsTxtFile": False,
89                    "startUrls": [
90                        {
91                            "url": target_url,
92                            "method": "GET"
93                        }
94                    ]
95                }
96            },
97            
98            "website-content-crawler": {
99                "actor_id": "apify/website-content-crawler",
100                "input": {
101                    "aggressivePrune": False,
102                    "clickElementsCssSelector": "[aria-expanded=\"false\"]",
103                    "clientSideMinChangePercentage": 15,
104                    "crawlerType": "playwright:adaptive",
105                    "debugLog": False,
106                    "debugMode": False,
107                    "dynamicContentWaitSecs": 15,
108                    "expandIframes": True,
109                    "ignoreCanonicalUrl": False,
110                    "keepUrlFragments": False,
111                    "maxCrawlDepth": 0,
112                    "proxyConfiguration": {
113                        "useApifyProxy": True,
114                        "apifyProxyGroups": [
115                            "RESIDENTIAL"
116                        ]
117                    },
118                    "readableTextCharThreshold": 100,
119                    "removeCookieWarnings": True,
120                    "renderingTypeDetectionPercentage": 10,
121                    "respectRobotsTxtFile": False,
122                    "saveFiles": False,
123                    "saveHtml": False,
124                    "saveHtmlAsFile": True,
125                    "saveMarkdown": False,
126                    "saveScreenshots": False,
127                    "startUrls": [
128                        {
129                            "url": target_url,
130                            "method": "GET"
131                        }
132                    ],
133                    "useSitemaps": False
134                }
135            },
136            
137            "web-scraper": {
138                "actor_id": "apify/web-scraper",
139                "input": {
140                    "breakpointLocation": "NONE",
141                    "browserLog": False,
142                    "closeCookieModals": False,
143                    "debugLog": False,
144                    "downloadCss": True,
145                    "downloadMedia": True,
146                    "headless": True,
147                    "ignoreCorsAndCsp": False,
148                    "ignoreSslErrors": False,
149                    "injectJQuery": True,
150                    "keepUrlFragments": False,
151                    "pageFunction": """// The function accepts a single argument: the "context" object.
152// For a complete list of its properties and functions,
153// see https://apify.com/apify/web-scraper#page-function 
154async function pageFunction(context) {
155    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
156    // debugger; 
157
158    // jQuery is handy for finding DOM elements and extracting data from them.
159    // To use it, make sure to enable the "Inject jQuery" option.
160    const $ = context.jQuery;
161    const url = context.request.url;
162    const html = $('html').html();  // Get the full HTML of the page
163
164    // Print some information to Actor log
165    context.log.info(`URL: ${url}, HTML length: ${html ? html.length : 0}`);
166
167    // Return an object with the data extracted from the page.
168    // It will be stored to the resulting dataset.
169    return {
170        url,
171        html
172    };
173}""",
174                    "postNavigationHooks": """// We need to return array of (possibly async) functions here.
175// The functions accept a single argument: the "crawlingContext" object.
176[
177    async (crawlingContext) => {
178        // ...
179    },
180]""",
181                    "preNavigationHooks": """// We need to return array of (possibly async) functions here.
182// The functions accept two arguments: the "crawlingContext" object
183// and "gotoOptions".
184[
185    async (crawlingContext, gotoOptions) => {
186        // ...
187    },
188]""",
189                    "proxyConfiguration": {
190                        "useApifyProxy": True
191                    },
192                    "respectRobotsTxtFile": False,
193                    "runMode": "PRODUCTION",
194                    "startUrls": [
195                        {
196                            "url": target_url,
197                            "method": "GET"
198                        }
199                    ],
200                    "useChrome": False,
201                    "waitUntil": [
202                        "networkidle2"
203                    ]
204                }
205            }
206        }
207    
208    def _run_single_actor(self, actor_name: str, actor_config: Dict[str, Any]) -> tuple[str, Optional[str]]:
209        """
210        Run a single actor and extract HTML content.
211        
212        Args:
213            actor_name: Name of the actor
214            actor_config: Configuration for the actor
215            
216        Returns:
217            Tuple of (actor_name, html_content or None)
218        """
219        try:
220            self.logger.info(f"Starting {actor_name}...")
221            
222            # Determine retrieval method based on actor type
223            if "website-content-crawler" in actor_config["actor_id"]:
224                retrieve_from = "key-value-store"
225            else:
226                retrieve_from = "dataset"
227            
228            results = run_apify_actor_with_flexible_retrieval(
229                actor_config["actor_id"],
230                actor_config["input"],
231                api_token=self.api_token,
232                retrieve_from=retrieve_from
233            )
234            
235            if not results:
236                self.logger.warning(f"{actor_name} returned no results")
237                return actor_name, None
238            
239            # Extract HTML from results
240            html_content = None
241            
242            # Handle different result formats
243            if isinstance(results, list):
244                # Dataset results (list format)
245                for item in results:
246                    if isinstance(item, dict):
247                        # For cheerio-scraper and web-scraper
248                        if 'html' in item:
249                            html_content = item['html']
250                            break
251                        # For other actors that might use different keys
252                        elif 'text' in item:
253                            html_content = item['text']
254                            break
255                        elif 'content' in item:
256                            html_content = item['content']
257                            break
258                    # If it's a string (sometimes the whole result is HTML)
259                    elif isinstance(item, str):
260                        html_content = item
261                        break
262            
263            elif isinstance(results, dict):
264                # Key-value store results (dict format) - for website-content-crawler
265                # Look for HTML files in the key-value store
266                for key, value in results.items():
267                    if key.endswith('.html') or 'html' in key.lower():
268                        if isinstance(value, str):
269                            html_content = value
270                            break
271                        elif isinstance(value, dict) and 'value' in value:
272                            html_content = value['value']
273                            break
274                
275                # If no HTML file found, look for other common keys
276                if not html_content:
277                    for key, value in results.items():
278                        if isinstance(value, str) and len(value) > 100:  # Likely HTML content
279                            html_content = value
280                            break
281                        elif isinstance(value, dict):
282                            # Check if the value dict contains HTML
283                            if 'html' in value:
284                                html_content = value['html']
285                                break
286                            elif 'content' in value:
287                                html_content = value['content']
288                                break
289            
290            if html_content:
291                self.logger.info(f"{actor_name} completed successfully - HTML length: {len(html_content)}")
292            else:
293                self.logger.warning(f"{actor_name} completed but no HTML content found in results")
294                # Log the structure of the first result for debugging
295                if isinstance(results, list) and results:
296                    self.logger.debug(f"{actor_name} result structure: {list(results[0].keys()) if isinstance(results[0], dict) else type(results[0])}")
297                elif isinstance(results, dict):
298                    self.logger.debug(f"{actor_name} key-value store keys: {list(results.keys())}")
299            
300            return actor_name, html_content
301            
302        except Exception as e:
303            self.logger.error(f"Error running {actor_name}: {str(e)}", exc_info=True)
304            return actor_name, None
305    
306    def scrape_with_multiple_actors(self, target_url: str, max_workers: int = 4) -> Dict[str, Optional[str]]:
307        """
308        Scrape a website using multiple Apify actors simultaneously.
309        
310        Args:
311            target_url: The URL to scrape
312            max_workers: Maximum number of concurrent workers
313            
314        Returns:
315            Dictionary mapping actor names to their scraped HTML content
316        """
317        self.logger.info(f"Starting multi-actor scraping for URL: {target_url}")
318        
319        actor_configs = self._get_actor_configs(target_url)
320        results = {}
321        
322        with ThreadPoolExecutor(max_workers=max_workers) as executor:
323            # Submit all actor runs
324            future_to_actor = {
325                executor.submit(self._run_single_actor, actor_name, config): actor_name
326                for actor_name, config in actor_configs.items()
327            }
328            
329            # Collect results as they complete
330            for future in as_completed(future_to_actor):
331                actor_name = future_to_actor[future]
332                try:
333                    actor_name_result, html_content = future.result()
334                    results[actor_name_result] = html_content
335                    
336                except Exception as e:
337                    self.logger.error(f"Error getting result for {actor_name}: {str(e)}")
338                    results[actor_name] = None
339        
340        # Log summary
341        successful_actors = [name for name, content in results.items() if content is not None]
342        failed_actors = [name for name, content in results.items() if content is None]
343        
344        self.logger.info(f"Scraping completed!")
345        self.logger.info(f"Successful actors: {successful_actors}")
346        if failed_actors:
347            self.logger.warning(f"Failed actors: {failed_actors}")
348        
349        return results
350    
351    def save_results_to_files(self, results: Dict[str, Optional[str]], output_dir: str = "scraped_results") -> None:
352        """
353        Save the scraped HTML results to separate files.
354        
355        Args:
356            results: Dictionary of actor-HTML pairs
357            output_dir: Directory to save the files
358        """
359        import os
360        
361        if not os.path.exists(output_dir):
362            os.makedirs(output_dir)
363        
364        for actor_name, html_content in results.items():
365            if html_content:
366                filename = f"{actor_name.replace('/', '_')}_result.html"
367                filepath = os.path.join(output_dir, filename)
368                
369                with open(filepath, 'w', encoding='utf-8') as f:
370                    f.write(html_content)
371                
372                self.logger.info(f"Saved {actor_name} result to {filepath}")
373            else:
374                self.logger.warning(f"No content to save for {actor_name}")
375    
376    def scrape_with_single_actor_flexible(target_url: str, actor_id: str, api_token: str, 
377                                        custom_input: Optional[Dict[str, Any]] = None) -> Optional[str]:
378        """
379        Convenience function to scrape a website using a single Apify actor with flexible retrieval.
380        
381        This function automatically detects whether to use dataset or key-value store based on the actor type.
382        
383        Args:
384            target_url: The URL to scrape
385            actor_id: The Apify actor ID (e.g., 'apify/website-content-crawler')
386            api_token: Apify API token
387            custom_input: Optional custom input configuration for the actor
388            
389        Returns:
390            HTML content as string if successful, None otherwise
391            
392        Example:
393            >>> # For website-content-crawler (uses key-value store)
394            >>> html = scrape_with_single_actor_flexible(
395            ...     "https://example.com", 
396            ...     "apify/website-content-crawler", 
397            ...     api_token
398            ... )
399            >>> # For web-scraper (uses dataset)
400            >>> html = scrape_with_single_actor_flexible(
401            ...     "https://example.com", 
402            ...     "apify/web-scraper", 
403            ...     api_token
404            ... )
405        """
406        logger = logging.getLogger(__name__)
407        
408        try:
409            # Use default input if none provided
410            if custom_input is None:
411                if "website-content-crawler" in actor_id:
412                    actor_input = {
413                        "aggressivePrune": False,
414                        "clickElementsCssSelector": "[aria-expanded=\"false\"]",
415                        "clientSideMinChangePercentage": 15,
416                        "crawlerType": "playwright:adaptive",
417                        "debugLog": False,
418                        "debugMode": False,
419                        "dynamicContentWaitSecs": 15,
420                        "expandIframes": True,
421                        "ignoreCanonicalUrl": False,
422                        "keepUrlFragments": False,
423                        "proxyConfiguration": {
424                            "useApifyProxy": True,
425                            "apifyProxyGroups": ["RESIDENTIAL"]
426                        },
427                        "readableTextCharThreshold": 100,
428                        "removeCookieWarnings": True,
429                        "renderingTypeDetectionPercentage": 10,
430                        "respectRobotsTxtFile": False,
431                        "saveFiles": False,
432                        "saveHtml": False,
433                        "saveHtmlAsFile": True,
434                        "saveMarkdown": False,
435                        "saveScreenshots": False,
436                        "startUrls": [{"url": target_url, "method": "GET"}],
437                        "useSitemaps": False
438                    }
439                else:
440                    # Default input for other actors
441                    actor_input = {
442                        "startUrls": [{"url": target_url, "method": "GET"}],
443                        "proxyConfiguration": {"useApifyProxy": True}
444                    }
445            else:
446                actor_input = custom_input
447            
448            # Determine retrieval method based on actor type
449            if "website-content-crawler" in actor_id:
450                retrieve_from = "key-value-store"
451            else:
452                retrieve_from = "dataset"
453            
454            # Run the actor with flexible retrieval
455            results = run_apify_actor_with_flexible_retrieval(
456                actor_id=actor_id,
457                actor_input=actor_input,
458                api_token=api_token,
459                retrieve_from=retrieve_from
460            )
461            
462            if not results:
463                logger.warning(f"No results returned from {actor_id}")
464                return None
465            
466            # Extract HTML content
467            html_content = None
468            
469            if isinstance(results, list):
470                # Dataset results
471                for item in results:
472                    if isinstance(item, dict) and 'html' in item:
473                        html_content = item['html']
474                        break
475            elif isinstance(results, dict):
476                # Key-value store results
477                for key, value in results.items():
478                    if key.endswith('.html') or 'html' in key.lower():
479                        if isinstance(value, str):
480                            html_content = value
481                            break
482                        elif isinstance(value, dict) and 'value' in value:
483                            html_content = value['value']
484                            break
485            
486            if html_content:
487                logger.info(f"Successfully scraped {len(html_content)} characters from {target_url}")
488            else:
489                logger.warning(f"No HTML content found in results from {actor_id}")
490                logger.debug(f"Result structure: {type(results)} with keys: {list(results.keys()) if isinstance(results, dict) else 'N/A'}")
491            
492            return html_content
493            
494        except Exception as e:
495            logger.error(f"Error scraping with {actor_id}: {str(e)}", exc_info=True)
496            return None
497
498
499def scrape_website_with_multiple_actors(target_url: str, api_token: str) -> Dict[str, Optional[str]]:
500    """
501    Convenience function to scrape a website using multiple Apify actors.
502    
503    Args:
504        target_url: The URL to scrape
505        api_token: Apify API token
506        
507    Returns:
508        Dictionary mapping actor names to their scraped HTML content
509    """
510    scraper = MultiActorScraper(api_token)
511    return scraper.scrape_with_multiple_actors(target_url)
512
513
514if __name__ == "__main__":
515    # Example usage
516    import os
517    
518    # Get API token from environment variable
519    api_token = os.getenv("APIFY_TOKEN")
520    if not api_token:
521        print("Please set the APIFY_TOKEN environment variable")
522        exit(1)
523    
524    # Example URL to scrape
525    target_url = "https://fbref.com/en/players/3d1f29d9/matchlogs/2021-2022/Jordyn-Huitema-Match-Logs"
526    
527    # Create scraper and run
528    scraper = MultiActorScraper(api_token)
529    results = scraper.scrape_with_multiple_actors(target_url)
530    
531    # Print results summary
532    print("\n=== SCRAPING RESULTS ===")
533    for actor_name, html_content in results.items():
534        if html_content:
535            print(f"{actor_name}: SUCCESS (HTML length: {len(html_content)})")
536        else:
537            print(f"{actor_name}: FAILED")
538    
539    # Optionally save results to files
540    scraper.save_results_to_files(results)

tests/examples.json

[
  {
    "name": "Use Case 1",
    "page_url": "https://books.toscrape.com/",
    "goal": "Get me a list of all the books on the first page. For each book, I want its title, price, star rating, and whether it is in stock."
  },
  {
    "name": "Use Case 2",
    "page_url": "https://www.theverge.com/",
    "goal": "I want to scrape the main articles from The Verge homepage. For each article, get me the headline, the author's name, and the link to the full article."
  },
  {
    "name": "Use Case 3",
    "page_url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "goal": "Get me information about the Python programming language from its Wikipedia page. I need the 'First appeared' date, the 'Stable release' version, and the official website from the infobox on the right."
  },
  {
    "name": "Use Case 4",
    "page_url": "https://www.python.org/jobs/",
    "goal": "List all the jobs posted. For each job, I want the job title, the company name, the location, and the date it was posted."
  },
  {
    "name": "Use Case 5",
    "page_url": "https://quotes.toscrape.com/",
    "goal": "I want a list of all quotes on this page. For each one, get the quote text itself, the name of the author, and a list of the tags associated with it."
  }
]

src/llmscraper/init.py

1"""
2ScraperCodeGenerator - Intelligent Web Scraping with AI
3
4A smart web scraping framework that uses multiple scraping strategies
5and AI-powered quality evaluation to extract data from websites.
6"""
7
8from .pipeline import IntelligentScraperPipeline, run_intelligent_scraper
9from .models import ScrapingResult, GoalExtractionResult
10from .main import main, extract_goal_from_prompt
11from .scraping.actor_multi_scraper import ActorMultiScraper
12
13__version__ = "0.1.0"
14__all__ = [
15    "IntelligentScraperPipeline",
16    "run_intelligent_scraper",
17    "ScrapingResult",
18    "GoalExtractionResult",
19    "main",
20    "extract_goal_from_prompt",
21    "ActorMultiScraper"
22]

src/llmscraper/main.py

1"""
2Main entry point and natural language processing for LLMScraper.
3"""
4
5import argparse
6import logging
7import re
8import sys
9from typing import Optional
10
11import anthropic
12
13from .models import ScrapingResult, GoalExtractionResult
14from .pipeline import run_intelligent_scraper
15from .utils import get_api_key, setup_logging
16
17
18def extract_goal_from_prompt(prompt: str, claude_api_key: str) -> GoalExtractionResult:
19    """
20    Extract URL and scraping goal from a natural language prompt.
21    
22    Args:
23        prompt: Natural language prompt containing URL and goal
24        claude_api_key: Claude API key for processing
25        
26    Returns:
27        GoalExtractionResult with extracted information
28    """
29    try:
30        client = anthropic.Anthropic(api_key=claude_api_key)
31        
32        system_prompt = """You are an expert at parsing natural language requests for web scraping. 
33Extract the URL and scraping goal from the user's prompt.
34
35Return your answer in this EXACT JSON format:
36{
37    "url": "extracted_url_here",
38    "goal": "clear_extraction_goal_here"
39}
40
41The goal should be specific and actionable for web scraping (e.g., "Get all product names and prices" rather than "scrape this site").
42
43Only return the JSON, no other text."""
44        
45        response = client.messages.create(
46            model="claude-3-5-sonnet-20241022",
47            max_tokens=500,
48            messages=[
49                {
50                    "role": "user", 
51                    "content": f"Extract the URL and scraping goal from this prompt: {prompt}"
52                }
53            ],
54            system=system_prompt
55        )
56        
57        content = response.content[0].text
58        
59        # Extract JSON from response
60        json_match = re.search(r'\{.*\}', content, re.DOTALL)
61        if not json_match:
62            return GoalExtractionResult(
63                goal="",
64                url="",
65                success=False,
66                error_message="Could not extract JSON from Claude response"
67            )
68        
69        import json
70        data = json.loads(json_match.group())
71        
72        url = data.get('url', '').strip()
73        goal = data.get('goal', '').strip()
74        
75        if not url or not goal:
76            return GoalExtractionResult(
77                goal=goal,
78                url=url,
79                success=False,
80                error_message="Missing URL or goal in extracted data"
81            )
82        
83        return GoalExtractionResult(
84            goal=goal,
85            url=url,
86            success=True
87        )
88        
89    except Exception as e:
90        return GoalExtractionResult(
91            goal="",
92            url="",
93            success=False,
94            error_message=f"Error extracting goal from prompt: {str(e)}"
95        )
96
97
98def main():
99    """Main CLI entry point."""
100    parser = argparse.ArgumentParser(
101        description="LLMScraper - Intelligent Web Scraping with AI",
102        formatter_class=argparse.RawDescriptionHelpFormatter,
103        epilog="""
104Examples:
105  # Direct URL and goal
106  llmscraper --url https://example.com --goal "Get all product names and prices"
107  
108  # Natural language prompt
109  llmscraper --prompt "I want to scrape all book titles and prices from books.toscrape.com"
110  
111  # Actor mode (for Apify)
112  llmscraper --url https://example.com --goal "Extract news headlines" --actor
113  
114  # Test the generated script
115  llmscraper --url https://example.com --goal "Get product info" --test
116        """
117    )
118    
119    # Input options
120    input_group = parser.add_mutually_exclusive_group(required=True)
121    input_group.add_argument(
122        '--prompt', 
123        help='Natural language prompt with URL and extraction goal'
124    )
125    input_group.add_argument(
126        '--url',
127        help='Target URL to scrape (use with --goal)'
128    )
129    
130    parser.add_argument(
131        '--goal',
132        help='Extraction goal (required when using --url)'
133    )
134    
135    # API keys
136    parser.add_argument(
137        '--apify-token',
138        help='Apify API token (or set APIFY_TOKEN env var)'
139    )
140    parser.add_argument(
141        '--claude-key',
142        help='Claude API key (or set CLAUDE_API_KEY env var)'
143    )
144    
145    # Output options
146    parser.add_argument(
147        '--output', '-o',
148        default='generated_scraper.py',
149        help='Output file path for generated script (default: generated_scraper.py)'
150    )
151    parser.add_argument(
152        '--actor',
153        action='store_true',
154        help='Generate script for Apify actor format'
155    )
156    
157    # Execution options
158    parser.add_argument(
159        '--test',
160        action='store_true',
161        help='Test the generated script before saving'
162    )
163    parser.add_argument(
164        '--no-prune-eval',
165        action='store_true',
166        help='Disable HTML pruning before quality evaluation'
167    )
168    
169    # Logging
170    parser.add_argument(
171        '--verbose', '-v',
172        action='store_true',
173        help='Enable verbose logging'
174    )
175    parser.add_argument(
176        '--quiet', '-q',
177        action='store_true',
178        help='Suppress non-error output'
179    )
180    
181    args = parser.parse_args()
182    
183    # Setup logging
184    if args.quiet:
185        log_level = "ERROR"
186    elif args.verbose:
187        log_level = "DEBUG"
188    else:
189        log_level = "INFO"
190    
191    setup_logging(log_level)
192    logger = logging.getLogger(__name__)
193    
194    # Validate arguments
195    if args.url and not args.goal:
196        logger.error("--goal is required when using --url")
197        return 1
198    
199    # Get API keys
200    apify_token = get_api_key("APIFY_TOKEN", args.apify_token)
201    claude_key = get_api_key("CLAUDE_API_KEY", args.claude_key)
202    
203    if not apify_token:
204        logger.error("APIFY_TOKEN is required (set environment variable or use --apify-token)")
205        return 1
206    
207    if not claude_key:
208        logger.error("CLAUDE_API_KEY is required (set environment variable or use --claude-key)")
209        return 1
210    
211    # Process input
212    if args.prompt:
213        logger.info("🔍 Extracting URL and goal from natural language prompt...")
214        extraction_result = extract_goal_from_prompt(args.prompt, claude_key)
215        
216        if not extraction_result.success:
217            logger.error(f"Failed to extract goal from prompt: {extraction_result.error_message}")
218            return 1
219        
220        target_url = extraction_result.url
221        user_goal = extraction_result.goal
222        
223        logger.info(f"📍 Extracted URL: {target_url}")
224        logger.info(f"🎯 Extracted Goal: {user_goal}")
225        
226    else:  # args.url and args.goal
227        target_url = args.url
228        user_goal = args.goal
229    
230    # Run the pipeline
231    logger.info("🚀 Starting intelligent scraper pipeline...")
232    
233    result = run_intelligent_scraper(
234        target_url=target_url,
235        user_goal=user_goal,
236        apify_token=apify_token,
237        claude_api_key=claude_key,
238        output_path=None if args.actor else args.output,
239        prune_before_evaluation=not args.no_prune_eval,
240        test_script=args.test,
241        for_actor=args.actor
242    )
243    
244    # Handle results
245    if result.success:
246        logger.info("✅ SUCCESS! Pipeline completed successfully!")
247        logger.info(f"🏆 Best actor: {result.best_actor}")
248        
249        if not args.actor:
250            logger.info(f"📁 Generated script: {args.output}")
251        
252        if result.quality_scores:
253            logger.info("📊 Quality scores:")
254            for actor, score in result.quality_scores.items():
255                emoji = "🟢" if score >= 7 else "🟡" if score >= 4 else "🔴"
256                logger.info(f"   {emoji} {actor}: {score}/10")
257        
258        if args.test and result.extracted_data:
259            data_count = len(result.extracted_data) if isinstance(result.extracted_data, list) else 1
260            logger.info(f"🧪 Test extraction successful: {data_count} items")
261        
262        if args.actor:
263            print("\\n" + "="*60)
264            print("📄 GENERATED ACTOR SCRIPT")
265            print("="*60)
266            print(result.generated_script)
267        else:
268            logger.info("\\n🚀 Next steps:")
269            logger.info(f"   1. Review the generated script: {args.output}")
270            logger.info(f"   2. Run the scraper: python {args.output}")
271            logger.info("   3. Check the extracted JSON data")
272        
273        return 0
274        
275    else:
276        logger.error(f"❌ FAILED: {result.error_message}")
277        if result.quality_scores:
278            logger.info("📊 Quality scores achieved:")
279            for actor, score in result.quality_scores.items():
280                emoji = "🟢" if score >= 7 else "🟡" if score >= 4 else "🔴"
281                logger.info(f"   {emoji} {actor}: {score}/10")
282        return 1
283
284
285if __name__ == "__main__":
286    sys.exit(main())

src/llmscraper/models.py

1"""
2Data models for the ScraperCodeGenerator pipeline.
3"""
4
5from dataclasses import dataclass
6from typing import Dict, Any, Optional, List
7
8
9@dataclass
10class ScrapingResult:
11    """Result of the complete scraping pipeline."""
12    success: bool
13    generated_script: Optional[str] = None
14    best_actor: Optional[str] = None
15    schema: Optional[Dict[str, Any]] = None
16    error_message: Optional[str] = None
17    quality_scores: Optional[Dict[str, int]] = None
18    extracted_data: Optional[List[Dict[str, Any]]] = None
19
20
21@dataclass
22class EvaluationResult:
23    """Result of HTML quality evaluation."""
24    score: int  # 1-10 scale
25    reasoning: str
26
27
28@dataclass
29class PreEvaluationResult:
30    """Result of pre-evaluation checks before sending to Claude."""
31    is_valid_html: bool
32    score: Optional[int] = None  # If we can determine score without Claude
33    reasoning: Optional[str] = None
34    should_continue_to_claude: bool = True
35
36
37@dataclass
38class GoalExtractionResult:
39    """Result of extracting goal from natural language prompt."""
40    goal: str
41    url: str
42    success: bool
43    error_message: Optional[str] = None

src/llmscraper/pipeline.py

1"""
2Main pipeline for intelligent web scraping.
3"""
4
5import logging
6from typing import Optional
7
8from .models import ScrapingResult
9from .scraping import MultiActorScraper
10from .scraping.actor_multi_scraper import ActorMultiScraper
11from .evaluation import HTMLQualityEvaluator
12from .generation import ScriptGenerator, ScriptExecutor
13from .utils import prune_html, validate_required_keys, get_api_key
14
15
16class IntelligentScraperPipeline:
17    """Main pipeline class that orchestrates the intelligent web scraping process."""
18    
19    def __init__(self, apify_token: str, claude_api_key: str, actor_logger=None):
20        """
21        Initialize the pipeline with required API tokens.
22        
23        Args:
24            apify_token: Apify API token for web scraping
25            claude_api_key: Anthropic Claude API key for AI analysis
26            actor_logger: Optional Actor logger for actor mode
27        """
28        # Validate API keys
29        validated_keys = validate_required_keys(
30            apify_token=apify_token,
31            claude_api_key=claude_api_key
32        )
33        
34        self.apify_token = validated_keys['apify_token']
35        self.claude_api_key = validated_keys['claude_api_key']
36        
37        # Initialize components
38        self.multi_scraper = MultiActorScraper(self.apify_token)
39        self.actor_scraper = ActorMultiScraper()  # For actor-to-actor communication
40        self.quality_evaluator = HTMLQualityEvaluator(self.claude_api_key)
41        self.script_generator = ScriptGenerator(self.claude_api_key)
42        self.script_executor = ScriptExecutor()
43        
44        # Setup logging - use Actor logger if provided, otherwise standard logging
45        self.logger = actor_logger if actor_logger else logging.getLogger(__name__)
46        self.is_actor_mode = actor_logger is not None
47    
48    async def run_complete_pipeline(self, target_url: str, user_goal: str, 
49                            output_script_path: Optional[str] = None,
50                            prune_before_evaluation: bool = True,
51                            test_script: bool = False,
52                            for_actor: bool = False) -> ScrapingResult:
53        """
54        Run the complete intelligent scraping pipeline.
55        
56        Args:
57            target_url: The URL to scrape
58            user_goal: Natural language description of what to extract
59            output_script_path: Path where to save the generated script (None for actor mode)
60            prune_before_evaluation: If True, prune HTML before quality evaluation
61            test_script: If True, test the generated script before finalizing
62            for_actor: If True, generate script for Apify actor format
63            
64        Returns:
65            ScrapingResult containing the outcome and generated artifacts
66        """
67        self.logger.info(f"PIPELINE: Starting intelligent scraping pipeline for: {target_url}")
68        self.logger.info(f"PIPELINE: User goal: {user_goal}")
69        self.logger.info(f"PIPELINE: Actor mode: {for_actor}")
70        
71        try:
72            # Step 1: Run multiple actors to scrape the website
73            self.logger.info("PIPELINE: Step 1: Running multi-actor scraping...")
74            
75            # Use actor-aware scraper if running inside an Apify actor
76            if for_actor:
77                self.logger.info("PIPELINE: Using actor-to-actor communication...")
78                scraping_results = await self.actor_scraper.scrape_with_multiple_actors(target_url)
79            else:
80                self.logger.info("PIPELINE: Using client-based scraping...")
81                scraping_results = self.multi_scraper.scrape_with_multiple_actors(target_url)
82            
83            if not any(content for content in scraping_results.values() if content):
84                return ScrapingResult(
85                    success=False,
86                    error_message="All scraping actors failed to retrieve content"
87                )
88            
89            # Step 2: Evaluate quality of each result
90            self.logger.info("PIPELINE: Step 2: Evaluating HTML quality for each actor...")
91            quality_scores, best_actor, best_html = self._evaluate_html_quality(
92                scraping_results, user_goal, prune_before_evaluation
93            )
94            
95            if not best_html:
96                return ScrapingResult(
97                    success=False,
98                    error_message="No actor produced quality HTML content",
99                    quality_scores=quality_scores
100                )
101            
102            self.logger.info(f"PIPELINE: Best actor selected: {best_actor} with score {quality_scores[best_actor]}/10")
103            
104            # Step 3: Prune the best HTML to reduce token count
105            self.logger.info("PIPELINE: Step 3: Pruning HTML content...")
106            pruned_html = prune_html(best_html, max_list_items=4, max_text_length=500)
107            
108            original_length = len(best_html)
109            pruned_length = len(pruned_html)
110            reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0
111            
112            self.logger.info(f"PIPELINE: HTML pruned: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")
113            
114            # Step 4: Generate Python scraping script
115            self.logger.info("PIPELINE: Step 4: Generating Python scraping script...")
116            generated_script = self.script_generator.generate_scraping_script(
117                target_url, best_actor, pruned_html, user_goal, for_actor
118            )
119            
120            if not generated_script:
121                return ScrapingResult(
122                    success=False,
123                    error_message="Failed to generate scraping script",
124                    best_actor=best_actor,
125                    quality_scores=quality_scores
126                )
127            
128            # Step 5: Test the script if requested
129            extracted_data = None
130            if test_script:
131                self.logger.info("PIPELINE: Step 5: Testing generated script...")
132                test_result = self.script_executor.test_script(generated_script, best_html)
133                
134                if test_result["success"]:
135                    self.logger.info(f"PIPELINE: ✅ Script test passed! Extracted {test_result.get('item_count', 0)} items")
136                    extracted_data = test_result["data"]
137                else:
138                    self.logger.warning(f"PIPELINE: ⚠️ Script test failed: {test_result['error']}")
139                    # Continue anyway, but log the issue
140            
141            # Step 6: Save the generated script (only if not actor mode)
142            if output_script_path and not for_actor:
143                self.logger.info(f"PIPELINE: Step 6: Saving generated script to {output_script_path}")
144                with open(output_script_path, 'w', encoding='utf-8') as f:
145                    f.write(generated_script)
146            
147            self.logger.info("PIPELINE: ✅ Pipeline completed successfully!")
148            
149            return ScrapingResult(
150                success=True,
151                generated_script=generated_script,
152                best_actor=best_actor,
153                quality_scores=quality_scores,
154                extracted_data=extracted_data
155            )
156            
157        except Exception as e:
158            self.logger.error(f"PIPELINE: Pipeline failed with error: {str(e)}")
159            return ScrapingResult(
160                success=False,
161                error_message=f"Pipeline error: {str(e)}"
162            )
163    
164    def _evaluate_html_quality(self, scraping_results: dict, user_goal: str, 
165                             prune_before_evaluation: bool) -> tuple[dict, str, str]:
166        """Evaluate HTML quality for each scraping result."""
167        quality_scores = {}
168        best_actor = None
169        best_html = None
170        best_score = 0
171        
172        for actor_name, html_content in scraping_results.items():
173            if html_content:
174                self.logger.info(f"PIPELINE: Evaluating {actor_name}...")
175                
176                # Optionally prune HTML before evaluation
177                evaluation_html = html_content
178                if prune_before_evaluation:
179                    original_length = len(html_content)
180                    evaluation_html = prune_html(html_content, max_list_items=3, max_text_length=100)
181                    pruned_length = len(evaluation_html)
182                    reduction = ((original_length - pruned_length) / original_length * 100) if original_length > 0 else 0
183                    self.logger.info(f"PIPELINE:    {actor_name} HTML pruned for evaluation: {original_length:,} → {pruned_length:,} chars ({reduction:.1f}% reduction)")
184                
185                evaluation = self.quality_evaluator.evaluate_html_quality(user_goal, evaluation_html)
186                
187                if evaluation:
188                    quality_scores[actor_name] = evaluation.score
189                    self.logger.info(f"PIPELINE: {actor_name} quality score: {evaluation.score}/10 - {evaluation.reasoning}")
190                    
191                    if evaluation.score > best_score:
192                        best_score = evaluation.score
193                        best_actor = actor_name
194                        best_html = html_content  # Keep original HTML, not pruned version
195                else:
196                    quality_scores[actor_name] = 0
197                    self.logger.warning(f"PIPELINE: Failed to evaluate {actor_name}")
198            else:
199                quality_scores[actor_name] = 0
200                self.logger.warning(f"PIPELINE: {actor_name} returned no content")
201        
202        return quality_scores, best_actor, best_html
203
204
205async def run_intelligent_scraper(target_url: str, user_goal: str, 
206                          apify_token: Optional[str] = None,
207                          claude_api_key: Optional[str] = None,
208                          output_path: Optional[str] = "generated_scraper.py",
209                          prune_before_evaluation: bool = True,
210                          test_script: bool = False,
211                          for_actor: bool = False,
212                          actor_logger=None) -> ScrapingResult:
213    """
214    Convenience function to run the complete intelligent scraping pipeline.
215    
216    Args:
217        target_url: URL to scrape
218        user_goal: Natural language description of extraction goal
219        apify_token: Apify API token (uses APIFY_TOKEN env var if not provided)
220        claude_api_key: Claude API key (uses CLAUDE_API_KEY env var if not provided)
221        output_path: Path to save the generated script (None for actor mode)
222        prune_before_evaluation: If True, prune HTML before quality evaluation
223        test_script: If True, test the generated script before finalizing
224        for_actor: If True, generate script for Apify actor format
225        actor_logger: Optional Actor logger for actor mode
226        
227    Returns:
228        ScrapingResult with the outcome
229    """
230    # Get tokens from environment if not provided
231    if not apify_token:
232        apify_token = get_api_key("APIFY_TOKEN")
233    if not claude_api_key:
234        claude_api_key = get_api_key("CLAUDE_API_KEY")
235    
236    if not apify_token:
237        return ScrapingResult(
238            success=False,
239            error_message="APIFY_TOKEN not provided and not found in environment variables"
240        )
241    
242    if not claude_api_key:
243        return ScrapingResult(
244            success=False,
245            error_message="CLAUDE_API_KEY not provided and not found in environment variables"
246        )
247    
248    # Create and run pipeline
249    pipeline = IntelligentScraperPipeline(apify_token, claude_api_key, actor_logger)
250    return await pipeline.run_complete_pipeline(
251        target_url, user_goal, output_path, prune_before_evaluation, test_script, for_actor
252    )

src/llmscraper/tempCodeRunnerFile.py

1generated_scraper

src/llmscraper/evaluation/init.py

1"""
2Evaluation module for ScraperCodeGenerator.
3"""
4
5from .html_quality_evaluator import HTMLQualityEvaluator
6
7__all__ = ["HTMLQualityEvaluator"]

src/llmscraper/evaluation/html_quality_evaluator.py

1"""
2HTML quality evaluation using Claude AI.
3"""
4
5import json
6import logging
7import re
8from typing import Optional
9
10import anthropic
11
12from ..models import EvaluationResult, PreEvaluationResult
13
14
15class HTMLQualityEvaluator:
16    """Evaluates HTML quality for web scraping using Claude AI."""
17    
18    def __init__(self, claude_api_key: str):
19        """Initialize with Claude API key."""
20        if not claude_api_key or not claude_api_key.strip():
21            raise ValueError("Claude API key cannot be empty")
22        
23        self.client = anthropic.Anthropic(api_key=claude_api_key)
24        self.logger = logging.getLogger(__name__)
25    
26    def evaluate_html_quality(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
27        """
28        Evaluate HTML quality for data extraction.
29        
30        Args:
31            user_goal: User's extraction goal
32            html_content: HTML content to evaluate
33            
34        Returns:
35            EvaluationResult or None if evaluation fails
36        """
37        try:
38            # Pre-evaluation checks
39            pre_eval = self._pre_evaluate_html(html_content)
40            if not pre_eval.should_continue_to_claude:
41                if pre_eval.score is not None:
42                    return EvaluationResult(score=pre_eval.score, reasoning=pre_eval.reasoning)
43                return None
44            
45            # Claude evaluation
46            return self._evaluate_with_claude(user_goal, html_content)
47            
48        except Exception as e:
49            self.logger.error(f"Error evaluating HTML quality: {str(e)}")
50            return None
51    
52    def _pre_evaluate_html(self, html_content: str) -> PreEvaluationResult:
53        """Perform basic HTML validation checks."""
54        if not html_content or not html_content.strip():
55            return PreEvaluationResult(
56                is_valid_html=False,
57                score=1,
58                reasoning="Empty or whitespace-only HTML content",
59                should_continue_to_claude=False
60            )
61        
62        # Check for common failure indicators
63        content_lower = html_content.lower()
64        
65        # Bot detection/blocking indicators
66        blocking_indicators = [
67            'please verify you are a human',
68            'access denied',
69            'blocked',
70            'captcha',
71            'cloudflare',
72            'ddos protection',
73            'security check',
74            'bot detected'
75        ]
76        
77        for indicator in blocking_indicators:
78            if indicator in content_lower:
79                return PreEvaluationResult(
80                    is_valid_html=False,
81                    score=1,
82                    reasoning=f"HTML appears to be blocked/bot-detected (found: '{indicator}')",
83                    should_continue_to_claude=False
84                )
85        
86        # Check for minimal HTML structure
87        if not re.search(r'<html|<body|<div|<p|<span', content_lower):
88            return PreEvaluationResult(
89                is_valid_html=False,
90                score=2,
91                reasoning="HTML lacks basic structural elements",
92                should_continue_to_claude=False
93            )
94        
95        return PreEvaluationResult(
96            is_valid_html=True,
97            should_continue_to_claude=True
98        )
99    
100    def _evaluate_with_claude(self, user_goal: str, html_content: str) -> Optional[EvaluationResult]:
101        """Evaluate HTML using Claude AI."""
102        try:
103            prompt = self._create_evaluation_prompt(user_goal, html_content)
104            
105            response = self.client.messages.create(
106                model="claude-3-5-sonnet-20241022",
107                max_tokens=500,
108                messages=[{"role": "user", "content": prompt}]
109            )
110            
111            content = response.content[0].text
112            return self._parse_evaluation_response(content)
113            
114        except Exception as e:
115            self.logger.error(f"Error in Claude evaluation: {str(e)}")
116            return None
117    
118    def _create_evaluation_prompt(self, user_goal: str, html_content: str) -> str:
119        """Create the evaluation prompt for Claude."""
120        return f"""You are an expert web scraper evaluator. Analyze the provided HTML and determine how suitable it is for extracting the requested data.
121
122USER EXTRACTION GOAL:
123{user_goal}
124
125HTML CONTENT TO EVALUATE:
126{html_content}
127
128Evaluate the HTML on a scale of 1-10 based on:
1291. Presence of the target data elements
1302. HTML structure quality and accessibility
1313. Whether the page loaded correctly (not blocked, error page, etc.)
1324. How easy it would be to extract the requested data
133
134Return your evaluation in this EXACT JSON format:
135{{
136    "score": [1-10 integer],
137    "reasoning": "[brief explanation of the score]"
138}}
139
140Only return the JSON, no other text.
141"""
142    
143    def _parse_evaluation_response(self, response: str) -> Optional[EvaluationResult]:
144        """Parse Claude's evaluation response."""
145        try:
146            # Extract JSON from response
147            json_match = re.search(r'\{.*\}', response, re.DOTALL)
148            if not json_match:
149                raise ValueError("No JSON found in response")
150            
151            data = json.loads(json_match.group())
152            
153            score = data.get('score')
154            reasoning = data.get('reasoning', '')
155            
156            if not isinstance(score, int) or score < 1 or score > 10:
157                raise ValueError(f"Invalid score: {score}")
158            
159            return EvaluationResult(score=score, reasoning=reasoning)
160            
161        except Exception as e:
162            self.logger.error(f"Error parsing evaluation response: {str(e)}")
163            return None

src/llmscraper/generation/init.py

1"""
2Generation module for ScraperCodeGenerator.
3"""
4
5from .script_generator import ScriptGenerator
6from .script_executor import ScriptExecutor
7
8__all__ = ["ScriptGenerator", "ScriptExecutor"]

src/llmscraper/generation/script_executor.py

1"""
2Script execution and testing functionality.
3"""
4
5import subprocess
6import tempfile
7import os
8import json
9import logging
10from typing import Dict, Any, Optional
11import ast
12import traceback
13
14
15class ScriptExecutor:
16    """Executes and tests generated scraping scripts."""
17    
18    def __init__(self):
19        """Initialize the script executor."""
20        self.logger = logging.getLogger(__name__)
21    
22    def test_script(self, script_content: str, html_content: str) -> Dict[str, Any]:
23        """
24        Test a scraping script against sample HTML content.
25        
26        Args:
27            script_content: The Python script to test
28            html_content: Sample HTML to test against
29            
30        Returns:
31            Dict with test results including success, data, and errors
32        """
33        try:
34            # Extract the extract_data function from the script
35            extract_function = self._extract_function_from_script(script_content, 'extract_data')
36            
37            if not extract_function:
38                return {
39                    "success": False,
40                    "error": "Could not find extract_data function in script",
41                    "data": None
42                }
43            
44            # Create a safe execution environment
45            safe_globals = {
46                '__builtins__': {
47                    'len': len,
48                    'str': str,
49                    'int': int,
50                    'float': float,
51                    'bool': bool,
52                    'list': list,
53                    'dict': dict,
54                    'range': range,
55                    'enumerate': enumerate,
56                    'zip': zip,
57                    'isinstance': isinstance,
58                    'hasattr': hasattr,
59                    'getattr': getattr,
60                    'print': print,
61                }
62            }
63            
64            # Import necessary modules into the environment
65            exec("from bs4 import BeautifulSoup", safe_globals)
66            exec("import re", safe_globals)
67            exec("import json", safe_globals)
68            
69            # Execute the function definition
70            exec(extract_function, safe_globals)
71            
72            # Call the function with the HTML content
73            extracted_data = safe_globals['extract_data'](html_content)
74            
75            return {
76                "success": True,
77                "data": extracted_data,
78                "error": None,
79                "data_type": type(extracted_data).__name__,
80                "item_count": len(extracted_data) if isinstance(extracted_data, (list, dict)) else 1
81            }
82            
83        except Exception as e:
84            self.logger.error(f"Error testing script: {str(e)}")
85            return {
86                "success": False,
87                "error": str(e),
88                "data": None,
89                "traceback": traceback.format_exc()
90            }
91    
92    def _extract_function_from_script(self, script_content: str, function_name: str) -> Optional[str]:
93        """Extract a specific function from a script."""
94        try:
95            # Parse the script into an AST
96            tree = ast.parse(script_content)
97            
98            # Find the function definition
99            for node in ast.walk(tree):
100                if isinstance(node, ast.FunctionDef) and node.name == function_name:
101                    # Get the source code of the function
102                    lines = script_content.split('\n')
103                    start_line = node.lineno - 1
104                    
105                    # Find the end of the function
106                    end_line = start_line + 1
107                    while end_line < len(lines):
108                        line = lines[end_line]
109                        # Check if this line starts a new function or class
110                        if line.strip() and not line.startswith(' ') and not line.startswith('\t'):
111                            break
112                        end_line += 1
113                    
114                    return '\n'.join(lines[start_line:end_line])
115            
116            return None
117            
118        except Exception as e:
119            self.logger.error(f"Error extracting function: {str(e)}")
120            return None
121    
122    def validate_script_syntax(self, script_content: str) -> Dict[str, Any]:
123        """
124        Validate the syntax of a Python script.
125        
126        Args:
127            script_content: The Python script to validate
128            
129        Returns:
130            Dict with validation results
131        """
132        try:
133            # Try to parse the script
134            ast.parse(script_content)
135            
136            return {
137                "valid": True,
138                "error": None
139            }
140            
141        except SyntaxError as e:
142            return {
143                "valid": False,
144                "error": f"Syntax error: {str(e)}",
145                "line": e.lineno,
146                "offset": e.offset
147            }
148        except Exception as e:
149            return {
150                "valid": False,
151                "error": f"Parse error: {str(e)}"
152            }
153    
154    def run_script_in_sandbox(self, script_content: str, timeout: int = 60) -> Dict[str, Any]:
155        """
156        Run a complete script in a sandboxed environment.
157        
158        Args:
159            script_content: The complete Python script
160            timeout: Maximum execution time in seconds
161            
162        Returns:
163            Dict with execution results
164        """
165        try:
166            # Create a temporary file
167            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:
168                temp_file.write(script_content)
169                temp_file_path = temp_file.name
170            
171            try:
172                # Run the script
173                result = subprocess.run(
174                    ['python', temp_file_path],
175                    capture_output=True,
176                    text=True,
177                    timeout=timeout,
178                    cwd=os.path.dirname(temp_file_path)
179                )
180                
181                return {
182                    "success": result.returncode == 0,
183                    "stdout": result.stdout,
184                    "stderr": result.stderr,
185                    "return_code": result.returncode
186                }
187                
188            finally:
189                # Clean up the temporary file
190                os.unlink(temp_file_path)
191                
192        except subprocess.TimeoutExpired:
193            return {
194                "success": False,
195                "stdout": "",
196                "stderr": f"Script execution timed out after {timeout} seconds",
197                "return_code": -1
198            }
199        except Exception as e:
200            return {
201                "success": False,
202                "stdout": "",
203                "stderr": str(e),
204                "return_code": -1
205            }

src/llmscraper/generation/script_generator.py

1"""
2Code generation functionality for creating scraping scripts.
3"""
4
5import logging
6from typing import Optional
7import re
8
9import anthropic
10
11
12class ScriptGenerator:
13    """Generates Python scraping scripts using Claude AI."""
14    
15    def __init__(self, claude_api_key: str):
16        """Initialize with Claude API key."""
17        if not claude_api_key or not claude_api_key.strip():
18            raise ValueError("Claude API key cannot be empty")
19        
20        self.client = anthropic.Anthropic(api_key=claude_api_key)
21        self.logger = logging.getLogger(__name__)
22    
23    def generate_scraping_script(self, target_url: str, best_actor: str, 
24                               pruned_html: str, user_goal: str, 
25                               for_actor: bool = False) -> Optional[str]:
26        """
27        Generate a complete Python scraping script.
28        
29        Args:
30            target_url: The target URL to scrape
31            best_actor: Name of the best performing actor
32            pruned_html: Sample HTML content for reference
33            user_goal: User's extraction goal
34            for_actor: If True, generate for Apify actor (key-value store output)
35            
36        Returns:
37            Complete Python script as string, or None if generation fails
38        """
39        try:
40            # Generate the HTML parsing code from Claude
41            parsing_code = self._generate_html_parsing_code(pruned_html, user_goal)
42            
43            if not parsing_code:
44                self.logger.error("Failed to generate HTML parsing code")
45                return None
46            
47            # Create the complete script
48            if for_actor:
49                return self._create_actor_script(target_url, best_actor, parsing_code, user_goal)
50            else:
51                return self._create_standalone_script(target_url, best_actor, parsing_code, user_goal)
52            
53        except Exception as e:
54            self.logger.error(f"Error generating scraping script: {str(e)}")
55            return None
56    
57    def _generate_html_parsing_code(self, pruned_html: str, user_goal: str) -> Optional[str]:
58        """Generate HTML parsing/extraction code using Claude."""
59        try:
60            prompt = f"""Generate Python code that parses HTML content and extracts data based on the user's goal. You should generate ONLY the extraction logic, not a complete script.
61
62## USER GOAL:
63{user_goal}
64
65## SAMPLE HTML (for reference):
66{pruned_html}
67
68## REQUIREMENTS:
691. Create a function called `extract_data(html_content)` that takes HTML string as input
702. Use BeautifulSoup to parse the HTML
713. Extract the data according to the user's goal using CSS selectors, attributes, text content, etc.
724. Return the extracted data as a Python dictionary or list of dictionaries
735. Handle missing or malformed data gracefully
746. Include appropriate error handling
75
76## EXAMPLE OUTPUT FORMAT:
77```python
78def extract_data(html_content):
79    from bs4 import BeautifulSoup
80    
81    soup = BeautifulSoup(html_content, 'html.parser')
82    results = []
83    
84    # Your extraction logic here
85    # Use soup.find(), soup.find_all(), CSS selectors, etc.
86    
87    return results
88```
89
90Generate ONLY the `extract_data` function and any helper functions needed. Do not include imports outside the function, full scripts, or other boilerplate code."""
91
92            self.logger.info("Requesting HTML parsing code generation from Claude...")
93            
94            response = self.client.messages.create(
95                model="claude-3-5-sonnet-20241022",
96                max_tokens=2000,
97                messages=[{"role": "user", "content": prompt}]
98            )
99            
100            parsing_code = response.content[0].text
101            
102            # Extract Python code from response if wrapped in code blocks
103            if "```python" in parsing_code:
104                code_match = re.search(r'```python\n(.*?)\n```', parsing_code, re.DOTALL)
105                if code_match:
106                    parsing_code = code_match.group(1)
107            
108            return parsing_code
109            
110        except Exception as e:
111            self.logger.error(f"Error generating HTML parsing code: {str(e)}")
112            return None
113    
114    def _create_standalone_script(self, target_url: str, best_actor: str, 
115                                parsing_code: str, user_goal: str) -> str:
116        """Create a standalone Python script."""
117        return f'''#!/usr/bin/env python3
118"""
119Generated Web Scraper
120Target: {target_url}
121Goal: {user_goal}
122Best Actor: {best_actor}
123Generated by: ScraperCodeGenerator
124"""
125
126import os
127import json
128import logging
129from typing import Dict, Any, List, Optional
130from bs4 import BeautifulSoup
131
132from llmscraper.scraping import MultiActorScraper
133
134
135{parsing_code}
136
137
138def main():
139    """Main function to run the scraper."""
140    # Setup logging
141    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
142    logger = logging.getLogger(__name__)
143    
144    # Configuration
145    target_url = "{target_url}"
146    apify_token = os.getenv("APIFY_TOKEN")
147    
148    if not apify_token:
149        logger.error("APIFY_TOKEN environment variable not set")
150        logger.info("Please set your Apify API token: export APIFY_TOKEN='your_token_here'")
151        return
152    
153    try:
154        logger.info(f"🚀 Starting scraper for: {{target_url}}")
155        logger.info(f"📝 Goal: {user_goal}")
156        logger.info(f"🏆 Using best actor: {best_actor}")
157        
158        # Initialize the multi-actor scraper
159        scraper = MultiActorScraper(apify_token)
160        
161        # Get the actor configuration for the best performing actor
162        actor_configs = scraper._get_actor_configs(target_url)
163        
164        if "{best_actor}" not in actor_configs:
165            logger.error(f"Best actor '{best_actor}' not found in available actors")
166            return
167        
168        # Run the best actor to get HTML
169        logger.info(f"🔄 Running {{'{best_actor}'}} actor...")
170        actor_name, html_content = scraper._run_single_actor("{best_actor}", actor_configs["{best_actor}"])
171        
172        if not html_content:
173            logger.error(f"Failed to get HTML content from {{'{best_actor}'}} actor")
174            return
175        
176        logger.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")
177        
178        # Extract data using the generated parsing code
179        logger.info("🔍 Extracting data from HTML...")
180        extracted_data = extract_data(html_content)
181        
182        if not extracted_data:
183            logger.warning("No data was extracted from the HTML")
184            return
185        
186        # Prepare final results
187        results = {{
188            "target_url": target_url,
189            "extraction_goal": "{user_goal}",
190            "actor_used": "{best_actor}",
191            "data": extracted_data,
192            "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1
193        }}
194        
195        # Output results
196        print("\\n" + "="*60)
197        print("📊 EXTRACTION RESULTS")
198        print("="*60)
199        print(json.dumps(results, indent=2, ensure_ascii=False))
200        
201        # Save to file
202        output_file = "extracted_data.json"
203        with open(output_file, 'w', encoding='utf-8') as f:
204            json.dump(results, f, indent=2, ensure_ascii=False)
205        
206        logger.info(f"💾 Results saved to {{output_file}}")
207        logger.info(f"🎉 Successfully extracted {{results['total_items']}} items!")
208        
209    except Exception as e:
210        logger.error(f"❌ Scraping failed: {{e}}")
211        import traceback
212        traceback.print_exc()
213
214
215if __name__ == "__main__":
216    main()
217'''
218    
219    def _create_actor_script(self, target_url: str, best_actor: str, 
220                           parsing_code: str, user_goal: str) -> str:
221        """Create a script for Apify actor."""
222        return f'''"""
223Apify Actor Script
224Target: {target_url}
225Goal: {user_goal}
226Best Actor: {best_actor}
227Generated by: ScraperCodeGenerator
228"""
229
230import json
231from apify import Actor
232from bs4 import BeautifulSoup
233
234from llmscraper.scraping import MultiActorScraper
235
236
237{parsing_code}
238
239
240async def main():
241    """Main actor function."""
242    async with Actor:
243        # Get input
244        actor_input = await Actor.get_input() or {{}}
245        target_url = actor_input.get('targetUrl', '{target_url}')
246        user_goal = actor_input.get('userGoal', '{user_goal}')
247        apify_token = actor_input.get('apifyToken') or Actor.config.token
248        
249        Actor.log.info(f"🚀 Starting scraper for: {{target_url}}")
250        Actor.log.info(f"📝 Goal: {{user_goal}}")
251        Actor.log.info(f"🏆 Using best actor: {best_actor}")
252        
253        try:
254            # Initialize the multi-actor scraper
255            scraper = MultiActorScraper(apify_token)
256            
257            # Run the best actor to get HTML
258            Actor.log.info(f"🔄 Running {best_actor} actor...")
259            actor_name, html_content = scraper._run_single_actor(
260                "{best_actor}", 
261                scraper._get_actor_configs(target_url)["{best_actor}"]
262            )
263            
264            if not html_content:
265                await Actor.fail(f"Failed to get HTML content from {best_actor} actor")
266                return
267            
268            Actor.log.info(f"✅ Retrieved HTML content: {{len(html_content):,}} characters")
269            
270            # Extract data using the generated parsing code
271            Actor.log.info("🔍 Extracting data from HTML...")
272            extracted_data = extract_data(html_content)
273            
274            if not extracted_data:
275                Actor.log.warning("No data was extracted from the HTML")
276                extracted_data = []
277            
278            # Prepare final results
279            results = {{
280                "target_url": target_url,
281                "extraction_goal": user_goal,
282                "actor_used": "{best_actor}",
283                "data": extracted_data,
284                "total_items": len(extracted_data) if isinstance(extracted_data, list) else 1
285            }}
286            
287            # Save to key-value store
288            await Actor.set_value('OUTPUT', results)
289            
290            Actor.log.info(f"🎉 Successfully extracted {{results['total_items']}} items!")
291            Actor.log.info("💾 Results saved to key-value store as 'OUTPUT'")
292            
293        except Exception as e:
294            Actor.log.error(f"❌ Scraping failed: {{e}}")
295            await Actor.fail(str(e))
296
297
298if __name__ == "__main__":
299    import asyncio
300    asyncio.run(main())
301'''

src/llmscraper/scraping/init.py

1"""
2Scraping module for ScraperCodeGenerator.
3"""
4
5from .apify_runner import ApifyRunner
6from .multi_actor_scraper import MultiActorScraper
7from .actor_multi_scraper import ActorMultiScraper
8
9__all__ = ["ApifyRunner", "MultiActorScraper", "ActorMultiScraper"]

src/llmscraper/scraping/actor_multi_scraper.py

1"""
2Apify Actor-specific scraping module for running other actors from within an Apify actor.
3"""
4
5import logging
6from typing import Dict, Any, List, Optional
7from apify import Actor
8
9
10class ActorMultiScraper:
11    """Handles running multiple Apify actors from within an Apify actor context."""
12    
13    def __init__(self):
14        """Initialize the actor scraper."""
15        self.logger = logging.getLogger(__name__)
16    
17    async def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, Optional[str]]:
18        """
19        Run multiple actors to scrape the target URL and return HTML content.
20        
21        Args:
22            target_url: The URL to scrape
23            
24        Returns:
25            Dictionary mapping actor names to their HTML content (or None if failed)
26        """
27        Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors called for {target_url}")
28        results = {}
29        
30        # Define actor configurations
31        actor_configs = self._get_actor_configs(target_url)
32        Actor.log.info(f"DEBUG: Will run {len(actor_configs)} actors: {list(actor_configs.keys())}")
33        
34        # Run each actor and collect results
35        for actor_name, config in actor_configs.items():
36            try:
37                Actor.log.info(f"DEBUG: Starting {actor_name}...")
38                html_content = await self._run_single_actor(actor_name, config)
39                results[actor_name] = html_content
40                
41                if html_content:
42                    Actor.log.info(f"DEBUG: {actor_name} succeeded: {len(html_content):,} characters")
43                else:
44                    Actor.log.warning(f"DEBUG: {actor_name} returned no content")
45                    
46            except Exception as e:
47                Actor.log.error(f"DEBUG: {actor_name} failed: {str(e)}")
48                results[actor_name] = None
49        
50        Actor.log.info(f"DEBUG: ActorMultiScraper.scrape_with_multiple_actors completed. Results: {list(results.keys())}")
51        Actor.log.info(f"DEBUG: Results with content: {[name for name, content in results.items() if content]}")
52        return results
53    
54    def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:
55        """Get configurations for all actors to run."""
56        return {
57            "cheerio-scraper": {
58                "actor_id": "apify/cheerio-scraper",
59                "input": {
60                    "startUrls": [{"url": target_url}],
61                    "maxRequestRetries": 3,
62                    "requestTimeoutSecs": 30,
63                    "maxPagesPerCrawl": 1,
64                    "pageFunction": """
65                    async function pageFunction(context) {
66                        const { request, log, $ } = context;
67                        try {
68                            const title = $('title').text() || '';
69                            const html = $('html').html() || '';
70                            return {
71                                url: request.url,
72                                title: title,
73                                html: html
74                            };
75                        } catch (error) {
76                            log.error('Error in pageFunction:', error);
77                            return {
78                                url: request.url,
79                                title: '',
80                                html: ''
81                            };
82                        }
83                    }
84                    """,
85                    "proxyConfiguration": {"useApifyProxy": True}
86                }
87            },
88            "web-scraper": {
89                "actor_id": "apify/web-scraper",
90                "input": {
91                    "startUrls": [{"url": target_url}],
92                    "maxRequestRetries": 3,
93                    "requestTimeoutSecs": 30,
94                    "maxPagesPerCrawl": 1,
95                    "pageFunction": """
96                    async function pageFunction(context) {
97                        const { request, log, page } = context;
98                        try {
99                            const title = await page.title();
100                            const html = await page.content();
101                            return { url: request.url, title, html };
102                        } catch (error) {
103                            log.error('Error in pageFunction:', error);
104                            return { url: request.url, title: '', html: '' };
105                        }
106                    }
107                    """,
108                    "proxyConfiguration": {"useApifyProxy": True}
109                }
110            },
111            "website-content-crawler": {
112                "actor_id": "apify/website-content-crawler",
113                "input": {
114                    "startUrls": [{"url": target_url}],
115                    "maxRequestsPerCrawl": 1,
116                    "maxCrawlDepth": 0,
117                    "htmlTransformer": "readableText",
118                    "readableTextCharThreshold": 100,
119                    "removeCookieWarnings": True,
120                    "clickElementsCssSelector": "",
121                    "proxyConfiguration": {"useApifyProxy": True}
122                }
123            }
124        }
125    
126    async def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> Optional[str]:
127        """
128        Run a single actor and extract HTML content.
129        
130        Args:
131            actor_name: Name of the actor (for logging)
132            config: Actor configuration including actor_id and input
133            
134        Returns:
135            HTML content as string, or None if failed
136        """
137        try:
138            actor_id = config["actor_id"]
139            actor_input = config["input"]
140            
141            Actor.log.info(f"DEBUG: Calling actor {actor_id}")
142            
143            # Call the actor using Apify SDK - use the exact same pattern as working code
144            run = await Actor.call(
145                actor_id=actor_id,
146                run_input=actor_input
147            )
148            
149            if not run:
150                Actor.log.error(f"DEBUG: Actor {actor_name} failed to start - run is None")
151                return None
152            
153            Actor.log.info(f"DEBUG: Actor {actor_name} run created with ID: {run.id}")
154            Actor.log.info(f"DEBUG: Default dataset ID: {run.default_dataset_id}")
155            
156            # Use the exact same pattern as your working code
157            if run.default_dataset_id:
158                try:
159                    Actor.log.info(f"DEBUG: Getting dataset items for {actor_name}...")
160                    items = (await Actor.apify_client.dataset(run.default_dataset_id).list_items()).items
161                    
162                    if items:
163                        Actor.log.info(f"DEBUG: Found {len(items)} items in dataset for {actor_name}")
164                        
165                        for i, item in enumerate(items):
166                            Actor.log.info(f"DEBUG: Dataset item {i} keys: {list(item.keys()) if isinstance(item, dict) else type(item)}")
167                            
168                            # Look for HTML content in the item
169                            html_content = self._extract_html_from_item(item, actor_name)
170                            if html_content:
171                                Actor.log.info(f"DEBUG: Found HTML content in dataset item {i} for {actor_name}: {len(html_content)} characters")
172                                return html_content
173                    else:
174                        Actor.log.info(f"DEBUG: No dataset items found for {actor_name}")
175                        
176                except Exception as e:
177                    Actor.log.error(f"DEBUG: Dataset retrieval failed for {actor_name}: {e}")
178                    import traceback
179                    Actor.log.error(f"DEBUG: Dataset traceback: {traceback.format_exc()}")
180            
181            # Fallback: Try key-value store (simplified)
182            if run.default_key_value_store_id:
183                try:
184                    Actor.log.info(f"DEBUG: Trying key-value store as fallback for {actor_name}...")
185                    kvs_client = Actor.apify_client.key_value_store(run.default_key_value_store_id)
186                    
187                    # Try common keys that might contain HTML
188                    common_keys = ['OUTPUT', 'RESULTS', 'DATA']
189                    for key_name in common_keys:
190                        try:
191                            record = await kvs_client.get_record(key_name)
192                            if record:
193                                Actor.log.info(f"DEBUG: Found record for key {key_name}")
194                                html_content = self._extract_html_from_record(record, actor_name)
195                                if html_content:
196                                    Actor.log.info(f"DEBUG: Found HTML content in key {key_name} for {actor_name}")
197                                    return html_content
198                        except Exception:
199                            pass  # Key doesn't exist, continue
200                        
201                except Exception as e:
202                    Actor.log.error(f"DEBUG: Key-value store retrieval failed for {actor_name}: {e}")
203            
204            Actor.log.warning(f"DEBUG: No HTML content found for {actor_name}")
205            return None
206            
207        except Exception as e:
208            Actor.log.error(f"DEBUG: Error running {actor_name}: {str(e)}")
209            import traceback
210            Actor.log.error(f"DEBUG: Full traceback: {traceback.format_exc()}")
211            return None
212    
213    def _extract_html_from_item(self, item: Dict[str, Any], actor_name: str) -> Optional[str]:
214        """Extract HTML content from a dataset item."""
215        Actor.log.info(f"DEBUG: Extracting HTML from item for {actor_name}, item keys: {list(item.keys()) if isinstance(item, dict) else 'not a dict'}")
216        
217        # Look for HTML in common fields
218        html_fields = ['html', 'content', 'body', 'pageContent', 'text', 'data']
219        
220        for field in html_fields:
221            if field in item and item[field]:
222                content = item[field]
223                Actor.log.info(f"DEBUG: Found content in field '{field}': {type(content)}, length: {len(content) if isinstance(content, str) else 'N/A'}")
224                
225                if isinstance(content, str) and len(content) > 100:
226                    # Check if it looks like HTML
227                    if '<' in content and '>' in content:
228                        Actor.log.info(f"DEBUG: Found HTML content in field '{field}' for {actor_name}")
229                        return content
230                    elif actor_name == "website-content-crawler":
231                        # For website-content-crawler, text content is also acceptable
232                        Actor.log.info(f"DEBUG: Found text content in field '{field}' for {actor_name}")
233                        html_content = f"<html><body><div>{content}</div></body></html>"
234                        return html_content
235        
236        # For website-content-crawler, look for any text-like content
237        if actor_name == "website-content-crawler":
238            for key, value in item.items():
239                if isinstance(value, str) and len(value) > 50:
240                    Actor.log.info(f"DEBUG: Using text content from field '{key}' for website-content-crawler")
241                    html_content = f"<html><body><div>{value}</div></body></html>"
242                    return html_content
243        
244        Actor.log.info(f"DEBUG: No HTML content found in item for {actor_name}")
245        return None
246    
247    def _extract_html_from_record(self, record: Any, actor_name: str) -> Optional[str]:
248        """Extract HTML content from a key-value store record."""
249        try:
250            # The record might be the content directly or wrapped in a dict
251            content = record
252            
253            if hasattr(record, 'value'):
254                content = record.value
255            elif isinstance(record, dict) and 'value' in record:
256                content = record['value']
257            
258            # If content is a string, check if it's HTML
259            if isinstance(content, str):
260                if len(content) > 100 and ('<' in content or actor_name == "website-content-crawler"):
261                    return content
262            
263            # If content is a dict, look for HTML fields
264            elif isinstance(content, dict):
265                html_content = self._extract_html_from_item(content, actor_name)
266                if html_content:
267                    return html_content
268            
269            return None
270            
271        except Exception as e:
272            Actor.log.debug(f"DEBUG: Error extracting HTML from record for {actor_name}: {e}")
273            return None

src/llmscraper/scraping/apify_runner.py

1"""
2Apify integration for running actors and retrieving results.
3"""
4
5import logging
6from typing import Optional, Dict, Any, List, Union
7
8from apify_client import ApifyClient
9
10
11class ApifyRunner:
12    """Handles running Apify actors and retrieving results."""
13    
14    def __init__(self, api_token: str):
15        """Initialize with API token."""
16        if not api_token or not api_token.strip():
17            raise ValueError("API token cannot be empty")
18        
19        self.client = ApifyClient(api_token)
20        self.logger = logging.getLogger(__name__)
21    
22    def run_actor(self, actor_id: str, actor_input: dict, 
23                  retrieve_from: str = "auto") -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
24        """
25        Run an Apify actor and retrieve results.
26        
27        Args:
28            actor_id: The ID of the Apify actor
29            actor_input: Input configuration for the actor
30            retrieve_from: "auto", "dataset", "key-value-store", or "both"
31            
32        Returns:
33            Retrieved data or None if failed
34        """
35        if not actor_id or not actor_id.strip():
36            raise ValueError("actor_id cannot be empty")
37        
38        if retrieve_from not in ["auto", "dataset", "key-value-store", "both"]:
39            raise ValueError("Invalid retrieve_from option")
40        
41        # Determine storage type
42        if retrieve_from == "auto":
43            retrieve_from = "key-value-store" if "website-content-crawler" in actor_id else "dataset"
44        
45        try:
46            self.logger.info(f"Starting Apify actor: {actor_id}")
47            
48            # Start the actor run
49            run = self.client.actor(actor_id).call(run_input=actor_input)
50            
51            if not run or run.get('status') != 'SUCCEEDED':
52                self.logger.error(f"Actor run failed: {run.get('status') if run else 'No run created'}")
53                return None
54            
55            run_id = run.get('id')
56            self.logger.info(f"Actor run {run_id} completed successfully")
57            
58            # Retrieve results based on type
59            if retrieve_from == "dataset":
60                return self._get_dataset_items(run_id)
61            elif retrieve_from == "key-value-store":
62                return self._get_key_value_store_items(run_id)
63            elif retrieve_from == "both":
64                return {
65                    "dataset": self._get_dataset_items(run_id),
66                    "key_value_store": self._get_key_value_store_items(run_id)
67                }
68            
69        except Exception as e:
70            self.logger.error(f"Error running actor {actor_id}: {str(e)}")
71            return None
72    
73    def _get_dataset_items(self, run_id: str) -> List[Dict[str, Any]]:
74        """Get items from the dataset of a run."""
75        try:
76            dataset_id = self.client.run(run_id).get().get('defaultDatasetId')
77            if not dataset_id:
78                self.logger.warning(f"No dataset found for run {run_id}")
79                return []
80            
81            dataset_items = list(self.client.dataset(dataset_id).iterate_items())
82            self.logger.info(f"Retrieved {len(dataset_items)} items from dataset")
83            return dataset_items
84            
85        except Exception as e:
86            self.logger.error(f"Error retrieving dataset items: {str(e)}")
87            return []
88    
89    def _get_key_value_store_items(self, run_id: str) -> Dict[str, Any]:
90        """Get items from the key-value store of a run."""
91        try:
92            kvs_id = self.client.run(run_id).get().get('defaultKeyValueStoreId')
93            if not kvs_id:
94                self.logger.warning(f"No key-value store found for run {run_id}")
95                return {}
96            
97            kvs = self.client.key_value_store(kvs_id)
98            keys = list(kvs.list_keys())
99            
100            items = {}
101            for key_info in keys:
102                # Handle case where key_info might be a string or dict
103                if isinstance(key_info, dict):
104                    key_name = key_info.get('key')
105                else:
106                    key_name = str(key_info)
107                    
108                if key_name:
109                    try:
110                        value = kvs.get_record(key_name)
111                        if value:
112                            # Handle case where value might be a string or dict
113                            if isinstance(value, dict):
114                                items[key_name] = value.get('value', value)
115                            else:
116                                items[key_name] = value
117                    except Exception as e:
118                        self.logger.warning(f"Failed to retrieve key '{key_name}': {str(e)}")
119            
120            self.logger.info(f"Retrieved {len(items)} items from key-value store")
121            return items
122            
123        except Exception as e:
124            self.logger.error(f"Error retrieving key-value store items: {str(e)}")
125            return {}
126
127
128# Legacy functions for backward compatibility
129def run_apify_actor(actor_id: str, actor_input: dict, *, api_token: str) -> Optional[List[Dict[str, Any]]]:
130    """Legacy function - use ApifyRunner class instead."""
131    runner = ApifyRunner(api_token)
132    result = runner.run_actor(actor_id, actor_input, "dataset")
133    return result if isinstance(result, list) else None
134
135
136def run_apify_actor_with_flexible_retrieval(
137    actor_id: str, actor_input: dict, *, api_token: str, retrieve_from: str = "auto"
138) -> Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]:
139    """Legacy function - use ApifyRunner class instead."""
140    runner = ApifyRunner(api_token)
141    return runner.run_actor(actor_id, actor_input, retrieve_from)

src/llmscraper/scraping/multi_actor_scraper.py

1"""
2Multi-actor scraping functionality.
3"""
4
5import logging
6from typing import Dict, Any
7from concurrent.futures import ThreadPoolExecutor, as_completed
8
9from .apify_runner import ApifyRunner
10
11
12class MultiActorScraper:
13    """Scrapes websites using multiple Apify actors simultaneously."""
14    
15    def __init__(self, api_token: str):
16        """Initialize with Apify API token."""
17        self.api_token = api_token
18        self.runner = ApifyRunner(api_token)
19        self.logger = logging.getLogger(__name__)
20    
21    def scrape_with_multiple_actors(self, target_url: str) -> Dict[str, str]:
22        """
23        Scrape a URL with multiple actors and return HTML content.
24        
25        Args:
26            target_url: URL to scrape
27            
28        Returns:
29            Dict mapping actor names to HTML content
30        """
31        actor_configs = self._get_actor_configs(target_url)
32        results = {}
33        
34        # Use ThreadPoolExecutor for concurrent execution
35        with ThreadPoolExecutor(max_workers=len(actor_configs)) as executor:
36            future_to_actor = {
37                executor.submit(self._run_single_actor, name, config): name
38                for name, config in actor_configs.items()
39            }
40            
41            for future in as_completed(future_to_actor):
42                actor_name = future_to_actor[future]
43                try:
44                    name, html_content = future.result()
45                    results[name] = html_content
46                except Exception as e:
47                    self.logger.error(f"Actor {actor_name} failed: {str(e)}")
48                    results[actor_name] = None
49        
50        return results
51    
52    def _get_actor_configs(self, target_url: str) -> Dict[str, Dict[str, Any]]:
53        """Get actor configurations for the target URL."""
54        return {
55            "cheerio-scraper": {
56                "actor_id": "apify/cheerio-scraper",
57                "input": {
58                    "startUrls": [{"url": target_url}],
59                    "maxRequestRetries": 3,
60                    "requestTimeoutSecs": 30,
61                    "maxRequestsPerCrawl": 1,
62                    "pseudoUrls": [],
63                    "linkSelector": "",
64                    "pageFunction": """
65                    async function pageFunction(context) {
66                        const { request, log, skipLinks, $ } = context;
67                        return {
68                            url: request.url,
69                            title: $('title').text(),
70                            html: $('html').html()
71                        };
72                    }
73                    """,
74                    "proxyConfiguration": {"useApifyProxy": True}
75                }
76            },
77            "web-scraper": {
78                "actor_id": "apify/web-scraper",
79                "input": {
80                    "startUrls": [{"url": target_url}],
81                    "maxRequestRetries": 3,
82                    "requestTimeoutSecs": 30,
83                    "maxPagesPerCrawl": 1,
84                    "pageFunction": """
85                    async function pageFunction(context) {
86                        const { request, log, skipLinks, $ } = context;
87                        return {
88                            url: request.url,
89                            title: $('title').text(),
90                            html: $('html').html()
91                        };
92                    }
93                    """,
94                    "proxyConfiguration": {"useApifyProxy": True}
95                }
96            },
97            "website-content-crawler": {
98                "actor_id": "apify/website-content-crawler",
99                "input": {
100                    "startUrls": [{"url": target_url}],
101                    "maxCrawlPages": 1,
102                    "crawler": "playwright",
103                    "proxyConfiguration": {"useApifyProxy": True}
104                }
105            }
106        }
107    
108    def _run_single_actor(self, actor_name: str, config: Dict[str, Any]) -> tuple[str, str]:
109        """
110        Run a single actor and extract HTML content.
111        
112        Args:
113            actor_name: Name of the actor
114            config: Actor configuration
115            
116        Returns:
117            Tuple of (actor_name, html_content)
118        """
119        try:
120            self.logger.info(f"Starting {actor_name}...")
121            
122            result = self.runner.run_actor(
123                config["actor_id"],
124                config["input"],
125                "auto"
126            )
127            
128            if not result:
129                self.logger.warning(f"{actor_name} returned no results")
130                return actor_name, None
131            
132            # Extract HTML based on result type
133            html_content = self._extract_html_from_result(result, actor_name)
134            
135            if html_content:
136                self.logger.info(f"{actor_name} completed successfully: {len(html_content):,} chars")
137            else:
138                self.logger.warning(f"{actor_name} returned no HTML content")
139            
140            return actor_name, html_content
141            
142        except Exception as e:
143            self.logger.error(f"Error running {actor_name}: {str(e)}")
144            return actor_name, None
145    
146    def _extract_html_from_result(self, result: Any, actor_name: str) -> str:
147        """Extract HTML content from actor result."""
148        try:
149            if isinstance(result, list) and result:
150                # Dataset result
151                item = result[0]
152                return item.get('html') or item.get('content', '')
153            elif isinstance(result, dict):
154                # Key-value store result
155                if 'OUTPUT' in result:
156                    output = result['OUTPUT']
157                    if isinstance(output, dict):
158                        return output.get('html') or output.get('content', '')
159                    elif isinstance(output, str):
160                        return output
161            
162            self.logger.warning(f"Unexpected result format from {actor_name}")
163            return None
164            
165        except Exception as e:
166            self.logger.error(f"Error extracting HTML from {actor_name}: {str(e)}")
167            return None

src/llmscraper/utils/init.py

1"""
2Utilities module for ScraperCodeGenerator.
3"""
4
5from .html_utils import is_html, prune_html, extract_text_content, validate_html_structure
6from .config import get_api_key, validate_required_keys, setup_logging
7
8__all__ = [
9    "is_html",
10    "prune_html", 
11    "extract_text_content",
12    "validate_html_structure",
13    "get_api_key",
14    "validate_required_keys",
15    "setup_logging"
16]

src/llmscraper/utils/config.py

1"""
2Configuration and environment utilities.
3"""
4
5import os
6from typing import Optional
7
8
9def get_api_key(key_name: str, provided_key: Optional[str] = None) -> Optional[str]:
10    """
11    Get API key from provided value or environment variable.
12    
13    Args:
14        key_name: Name of the environment variable
15        provided_key: Explicitly provided key (takes precedence)
16        
17    Returns:
18        API key or None if not found
19    """
20    if provided_key and provided_key.strip():
21        return provided_key.strip()
22    
23    return os.getenv(key_name)
24
25
26def validate_required_keys(**keys) -> dict[str, str]:
27    """
28    Validate that all required API keys are present.
29    
30    Args:
31        **keys: Key-value pairs of key names and values
32        
33    Returns:
34        Dict of validated keys
35        
36    Raises:
37        ValueError: If any required key is missing
38    """
39    validated = {}
40    missing = []
41    
42    for key_name, key_value in keys.items():
43        if not key_value or not key_value.strip():
44            missing.append(key_name)
45        else:
46            validated[key_name] = key_value.strip()
47    
48    if missing:
49        raise ValueError(f"Missing required API keys: {', '.join(missing)}")
50    
51    return validated
52
53
54def setup_logging(level: str = "INFO") -> None:
55    """
56    Setup logging configuration.
57    
58    Args:
59        level: Logging level (DEBUG, INFO, WARNING, ERROR)
60    """
61    import logging
62    
63    logging.basicConfig(
64        level=getattr(logging, level.upper()),
65        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
66        handlers=[
67            logging.StreamHandler()
68        ]
69    )

src/llmscraper/utils/html_utils.py

1"""
2HTML utility functions for processing web content.
3"""
4
5from typing import Optional
6from bs4 import BeautifulSoup, Comment, NavigableString
7import re
8
9
10def is_html(text_content: str) -> bool:
11    """
12    Check if a string is likely HTML content.
13    
14    Args:
15        text_content: The text content to check
16        
17    Returns:
18        True if the content appears to be HTML
19    """
20    if not text_content or not isinstance(text_content, str):
21        return False
22    
23    content_lower = text_content.lower()
24    return '<html>' in content_lower and '<body>' in content_lower
25
26
27def prune_html(html_content: str, max_list_items: int = 5, max_text_length: int = 500) -> str:
28    """
29    Clean and shorten HTML content to reduce token count while preserving structure.
30    
31    Args:
32        html_content: The raw HTML content to process
33        max_list_items: Maximum number of list items to keep
34        max_text_length: Maximum length of text content in any tag
35        
36    Returns:
37        The cleaned and shortened HTML
38    """
39    if not html_content or not isinstance(html_content, str):
40        return ""
41    
42    try:
43        soup = BeautifulSoup(html_content, 'html.parser')
44        
45        # Remove unwanted tags entirely
46        unwanted_tags = ['script', 'style', 'svg', 'noscript', 'meta', 'link']
47        for tag_name in unwanted_tags:
48            for tag in soup.find_all(tag_name):
49                tag.decompose()
50        
51        # Remove HTML comments
52        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
53            comment.extract()
54        
55        # Remove unwanted attributes from all tags
56        allowed_attributes = {'id', 'class', 'href', 'src', 'alt', 'title'}
57        for tag in soup.find_all(True):
58            if hasattr(tag, 'attrs'):
59                tag.attrs = {key: value for key, value in tag.attrs.items() 
60                           if key in allowed_attributes}
61        
62        # Truncate lists and tables
63        list_and_table_tags = ['ul', 'ol', 'table', 'tbody', 'thead']
64        for tag_name in list_and_table_tags:
65            for tag in soup.find_all(tag_name):
66                children = list(tag.children)
67                # Filter out NavigableString objects (text nodes, whitespace)
68                non_text_children = [child for child in children if not isinstance(child, NavigableString)]
69                
70                if len(non_text_children) > max_list_items:
71                    # Keep only the first max_list_items children
72                    for child in non_text_children[max_list_items:]:
73                        child.decompose()
74                    
75                    # Add a comment indicating truncation
76                    if tag.name in ['ul', 'ol']:
77                        truncation_notice = soup.new_tag("li")
78                        truncation_notice.string = f"... ({len(non_text_children) - max_list_items} more items)"
79                        tag.append(truncation_notice)
80                    elif tag.name == 'table':
81                        truncation_notice = soup.new_tag("tr")
82                        td = soup.new_tag("td")
83                        td.string = f"... ({len(non_text_children) - max_list_items} more rows)"
84                        truncation_notice.append(td)
85                        tag.append(truncation_notice)
86        
87        # Truncate long text content
88        for element in soup.find_all(string=True):
89            if isinstance(element, NavigableString) and not isinstance(element, Comment):
90                text = str(element).strip()
91                if len(text) > max_text_length:
92                    element.replace_with(text[:max_text_length] + "...")
93        
94        # Return the cleaned HTML
95        return str(soup)
96        
97    except Exception as e:
98        # If parsing fails, return original content truncated
99        return html_content[:max_text_length * 10] if len(html_content) > max_text_length * 10 else html_content
100
101
102def extract_text_content(html_content: str) -> str:
103    """
104    Extract clean text content from HTML.
105    
106    Args:
107        html_content: HTML content to extract text from
108        
109    Returns:
110        Clean text content
111    """
112    if not html_content:
113        return ""
114    
115    try:
116        soup = BeautifulSoup(html_content, 'html.parser')
117        return soup.get_text(separator=' ', strip=True)
118    except Exception:
119        return html_content
120
121
122def validate_html_structure(html_content: str) -> bool:
123    """
124    Validate basic HTML structure.
125    
126    Args:
127        html_content: HTML content to validate
128        
129    Returns:
130        True if HTML has basic valid structure
131    """
132    if not html_content:
133        return False
134    
135    try:
136        soup = BeautifulSoup(html_content, 'html.parser')
137        
138        # Check for basic HTML elements
139        has_html_tag = soup.find('html') is not None
140        has_body_tag = soup.find('body') is not None
141        has_content = len(soup.get_text(strip=True)) > 0
142        
143        return has_html_tag or has_body_tag or has_content
144        
145    except Exception:
146        return False

ScraperCodeGenerator

ScraperCodeGenerator

You might also like

Flow AI Agent

Natural Language Dataset Query

Lara Translate MCP Server

MCP Stress Tester

Calculator MCP server

Playwright MCP Server

Firecrawl MCP Server

Cars Com Unlimited

Mastra.ai MCP Agent

Apify Email Signature Generator

Related articles

.gitignore

.python-version

Dockerfile

cli.py

main.py

package.json

pyproject.toml

requirements.txt

uv.lock

.actor/README.md

.actor/actor.json

quality_eval/claude_client.py

quality_eval/html_quality_evaluator.py

manual_scraping/apify_runner.py

manual_scraping/multi_actor_scraper.py

tests/examples.json

src/llmscraper/__init__.py

src/llmscraper/main.py

src/llmscraper/models.py

src/llmscraper/pipeline.py

src/llmscraper/tempCodeRunnerFile.py

src/llmscraper/evaluation/__init__.py

src/llmscraper/evaluation/html_quality_evaluator.py

src/llmscraper/generation/__init__.py

src/llmscraper/generation/script_executor.py

src/llmscraper/generation/script_generator.py

src/llmscraper/scraping/__init__.py

src/llmscraper/scraping/actor_multi_scraper.py

src/llmscraper/scraping/apify_runner.py

src/llmscraper/scraping/multi_actor_scraper.py

src/llmscraper/utils/__init__.py

src/llmscraper/utils/config.py

src/llmscraper/utils/html_utils.py

src/llmscraper/init.py

src/llmscraper/evaluation/init.py

src/llmscraper/generation/init.py

src/llmscraper/scraping/init.py

src/llmscraper/utils/init.py