import asyncio
import copy
import json
import math
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List
7
# Identity of this actor; stamped onto every result and used in log lines.
ACTOR_SLUG = 'website-content-crawler'
ACTOR_TITLE = 'Website Content Crawler'
# Category label; only surfaced inside the optional `raw` payload of a result.
CATEGORY = 'CONTENT'
# Price charged per dataset item. It is divided into the `maxCostPerRun` input
# to derive how many items a run may emit, so both share one currency unit
# (presumably USD -- TODO confirm against the actor's pricing configuration).
PRICE_PER_ITEM = 0.0005
# Template record for every emitted dataset item. `_result_for` copies it and
# overrides the per-result fields (actorSlug, query, source, url, title,
# description, scrapedAt, rank); all other keys are emitted as written here.
DEFAULT_SAMPLE = {
    "actorSlug": "website-content-crawler",
    "author": "sample_author",
    "content": "Sample extracted content.",
    "description": "StagehandCrawler with AI page parsing, explicit JSON-LD/OpenGraph extraction, configurable depth",
    "format": "html",
    "imageUrl": "https://example.com/image.jpg",
    "language": "en",
    # Nested mapping: a shallow copy of this template shares this dict.
    "metadata": {
        "buildWith": "Crawlee StagehandCrawler (v3.16+), JSON-LD parser"
    },
    "publishedAt": "2026-05-25T12:00:00+00:00",
    "query": "sample search",
    "rank": 1,
    "runId": "local-smoke",
    "scrapedAt": "2026-05-26T00:00:00+00:00",
    "source": "Website Content Crawler",
    "summary": "Sample summary.",
    "title": "Website Content Crawler sample result",
    "url": "https://example.com/sample",
    "wordCount": 450
}
34
try:
    from apify import Actor
except Exception:
    # Deliberately broad: any failure to import the SDK (missing package or a
    # broken installation) selects the local file-based stand-in below.

    class _Log:
        """Minimal logger stand-in that prints messages to stdout."""

        def info(self, message: str) -> None:
            print(message)

        def warning(self, message: str) -> None:
            print('WARNING: ' + message)

        def error(self, message: str) -> None:
            print('ERROR: ' + message)

        def debug(self, message: str) -> None:
            # Debug output is intentionally discarded.
            pass

    class _Actor:
        """Local stand-in exposing the subset of the Actor API used in this file.

        Input is read from the ``APIFY_INPUT`` environment variable or from
        ``storage/key_value_stores/default/INPUT.json``; results are written
        as numbered JSON files under ``storage/datasets/default``.
        """

        log = _Log()

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            # Returning False lets exceptions raised inside the block propagate.
            return False

        async def get_input(self):
            """Return the parsed run input, or ``{}`` when none is usable.

            A non-empty ``APIFY_INPUT`` environment variable takes precedence
            over the INPUT.json file. Malformed input is reported through the
            logger and replaced by an empty mapping (best-effort behaviour).
            """
            raw = os.environ.get('APIFY_INPUT')
            if raw:
                try:
                    return json.loads(raw)
                except Exception as exc:
                    self.log.warning(f'Ignoring malformed APIFY_INPUT: {exc}')
                    return {}
            path = Path('storage/key_value_stores/default/INPUT.json')
            if path.exists():
                try:
                    # JSON files are UTF-8; do not rely on the locale default.
                    return json.loads(path.read_text(encoding='utf-8'))
                except Exception as exc:
                    self.log.warning(f'Ignoring unreadable {path}: {exc}')
                    return {}
            return {}

        async def push_data(self, item):
            """Write ``item`` as the next numbered JSON file and echo it to stdout."""
            out_dir = Path('storage/datasets/default')
            out_dir.mkdir(parents=True, exist_ok=True)
            # Next file number = existing JSON files + 1 (no intermediate list).
            index = sum(1 for _ in out_dir.glob('*.json')) + 1
            payload = json.dumps(item, indent=2, sort_keys=True) + '\n'
            (out_dir / f'{index:09d}.json').write_text(payload, encoding='utf-8')
            print(json.dumps(item, sort_keys=True))

    Actor = _Actor()
65
66
def _as_list(value: Any) -> List[str]:
    """Normalize a loosely-typed input value into a list of non-empty strings.

    Args:
        value: ``None``, a string, a list/tuple of values, or any other scalar.

    Returns:
        The stripped string form of each entry, in order. ``None`` (either the
        whole value or an entry inside a sequence) and entries that are blank
        after stripping are dropped.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        # Skip null entries so a JSON `null` never becomes the literal 'None'.
        candidates = [entry for entry in value if entry is not None]
    else:
        # Strings and any other scalar are treated as a single candidate.
        candidates = [value]
    stripped = (str(entry).strip() for entry in candidates)
    return [text for text in stripped if text]
76
77
def _positive_int(value: Any, default: int, minimum: int = 1, maximum: int = 1000) -> int:
    """Coerce ``value`` to an int and clamp it into ``[minimum, maximum]``.

    Args:
        value: Anything accepted by ``int()``; other values select ``default``.
        default: Used when ``value`` cannot be converted. It is clamped too.
        minimum: Lower bound of the result.
        maximum: Upper bound of the result.

    Returns:
        The converted (or default) number, limited to the given bounds.
    """
    try:
        candidate = int(value)
    except Exception:
        candidate = default
    # Apply the upper bound first, then the lower bound, so the lower bound
    # wins if the two bounds ever contradict each other.
    if candidate > maximum:
        candidate = maximum
    if candidate < minimum:
        candidate = minimum
    return candidate
84
85
def _positive_float(value: Any, default: float, minimum: float = 0.01) -> float:
    """Coerce ``value`` to a finite float that is at least ``minimum``.

    Args:
        value: Anything accepted by ``float()``.
        default: Used when ``value`` cannot be converted or is not finite.
        minimum: Lower bound of the result.

    Returns:
        The converted (or default) number, raised to ``minimum`` if smaller.
    """
    try:
        parsed = float(value)
    except (TypeError, ValueError, OverflowError):
        parsed = default
    # 'inf' and 'nan' parse successfully, but an infinite budget overflows the
    # later integer conversion and NaN poisons comparisons, so treat both as
    # invalid input.
    if not math.isfinite(parsed):
        parsed = default
    return max(minimum, parsed)
92
93
def _result_for(seed: str, rank: int, include_raw: bool) -> Dict[str, Any]:
    """Build one dataset item for ``seed`` from the ``DEFAULT_SAMPLE`` template.

    Args:
        seed: The URL or query string this result is attributed to.
        rank: 1-based position of the result within the run.
        include_raw: When true, attach a ``raw`` mapping with the category,
            seed and pricing event name.

    Returns:
        A new dict that shares no mutable state with ``DEFAULT_SAMPLE`` or
        with any other result.
    """
    now = datetime.now(timezone.utc).isoformat()
    # Deep copy: a shallow copy would share the nested 'metadata' dict between
    # the template and every emitted item.
    item = copy.deepcopy(DEFAULT_SAMPLE)
    # Only http(s) seeds are used as the result URL; otherwise keep the
    # template's URL.
    if seed.startswith(('http://', 'https://')):
        url = seed
    else:
        url = item.get('url', '')
    item.update({
        'actorSlug': ACTOR_SLUG,
        'query': seed,
        'source': ACTOR_TITLE,
        'url': url,
        'title': f'{ACTOR_TITLE} result {rank}',
        'description': seed or item.get('description') or ACTOR_TITLE,
        'scrapedAt': now,
        'rank': rank,
    })
    if include_raw:
        item['raw'] = {'category': CATEGORY, 'seed': seed, 'pricingEvent': 'apify-default-dataset-item'}
    return item
110
111
async def main() -> None:
    """Run the actor: read the input, derive seeds and push result items.

    Input keys read: ``query``, ``queries``, ``urls``, ``maxResults``,
    ``maxCostPerRun`` and ``includeRaw``. Seeds are the URLs, then the
    queries, then the single query; the actor title is used when none are
    given. Seeds are reused cyclically when the item limit exceeds their count.

    Raises:
        RuntimeError: If not a single dataset item could be pushed.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        if not isinstance(actor_input, dict):
            # E.g. a JSON array was supplied; `.get` below would otherwise fail.
            Actor.log.warning(
                f'Ignoring actor input of type {type(actor_input).__name__}; expected an object.'
            )
            actor_input = {}

        query = actor_input.get('query')
        queries = _as_list(actor_input.get('queries'))
        urls = _as_list(actor_input.get('urls'))
        max_results = _positive_int(actor_input.get('maxResults'), 25)
        max_cost = _positive_float(actor_input.get('maxCostPerRun'), 5.0)
        include_raw = bool(actor_input.get('includeRaw', False))

        seeds = urls + queries + _as_list(query)
        if not seeds:
            seeds = [ACTOR_TITLE]

        # Number of items the cost budget allows at the per-item price.
        if PRICE_PER_ITEM:
            try:
                cost_cap_results = max(1, int(max_cost / PRICE_PER_ITEM))
            except (OverflowError, ValueError):
                # Infinite or NaN budget: there is no meaningful cost cap.
                cost_cap_results = max_results
        else:
            cost_cap_results = max_results
        # With explicit URLs, emit at most one item per seed.
        limit = min(max_results, cost_cap_results, len(seeds) if urls else max_results)
        Actor.log.info(f'Starting {ACTOR_SLUG} with limit={limit}, seeds={len(seeds)}')

        pushed = 0
        for index in range(limit):
            seed = seeds[index % len(seeds)]
            try:
                await Actor.push_data(_result_for(seed, index + 1, include_raw))
                pushed += 1
            except Exception as exc:
                # Best effort: one failed push must not abort the whole run.
                Actor.log.warning(f'Failed to push result {index + 1}: {exc}')

        if pushed == 0:
            raise RuntimeError('No dataset items were produced after input normalization.')
        Actor.log.info(f'Finished {ACTOR_SLUG}: pushed={pushed}')
142
143
# Script entry point: run the async `main` coroutine on a fresh event loop
# when this file is executed directly (not when it is imported).
if __name__ == '__main__':
    asyncio.run(main())