1"""
2Site Researcher - Apify Actor
3Extract all images, videos, and data from any website.
4
5Features:
6- Crawl sitemap to discover all pages
7- Extract structured data (JSON-LD, meta tags)
8- Detect technology stack
9- Download all media files
10- RAG-efficient Markdown extraction for AI/LLM use
11"""

import json
import os
import re
import tempfile
from copy import deepcopy
from datetime import datetime
from typing import Optional, Tuple
from urllib.parse import urljoin, urlparse

import requests
from apify import Actor
from bs4 import BeautifulSoup
from markdownify import markdownify as md


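# Substring signatures searched (case-insensitively) in page HTML and response
# headers by detect_tech() below.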
TECH_SIGNATURES = {
    "shopify": ["cdn.shopify.com", "Shopify.theme", "myshopify.com"],
    "wordpress": ["wp-content", "wp-includes", "WordPress"],
    "wix": ["wix.com", "wixstatic.com"],
    "squarespace": ["squarespace.com", "sqsp.net"],
    "webflow": ["webflow.com", "website-files.com"],
    "react": ["__NEXT_DATA__", "reactroot", "_next/"],
    "vue": ["__VUE__", "vue.js"],
    "google_analytics": ["gtag(", "google-analytics.com", "googletagmanager.com"],
    "facebook_pixel": ["fbq(", "connect.facebook.net"],
    "stripe": ["stripe.com", "Stripe("],
    "cloudflare": ["cloudflare.com", "cf-ray"],
}


def extract_markdown(soup: BeautifulSoup, url: str) -> str:
    """
    Extract RAG-efficient Markdown from HTML.
    Strips boilerplate (nav, footer, scripts) and converts the main content to clean Markdown.
    """
    # Work on a copy so the caller's soup is left untouched.
    soup_copy = deepcopy(soup)

    # Remove structural boilerplate elements entirely.
    for tag in soup_copy.find_all(['nav', 'header', 'footer', 'aside', 'script',
                                   'style', 'noscript', 'iframe', 'form']):
        tag.decompose()

    # Remove elements whose class or id suggests navigation, ads, or overlays.
    boilerplate_patterns = ['menu', 'sidebar', 'footer', 'header', 'nav', 'cookie',
                            'popup', 'modal', 'advertisement', 'ad-', 'social', 'share']
    for pattern in boilerplate_patterns:
        for tag in soup_copy.find_all(class_=re.compile(pattern, re.I)):
            tag.decompose()
        for tag in soup_copy.find_all(id=re.compile(pattern, re.I)):
            tag.decompose()

    # Prefer semantic containers, falling back to the whole body.
    main_content = (
        soup_copy.find('main') or
        soup_copy.find('article') or
        soup_copy.find(id='content') or
        soup_copy.find(class_='content') or
        soup_copy.find('body')
    )

    if not main_content:
        return ""

    try:
        markdown_text = md(str(main_content), heading_style="ATX", strip=['img', 'a'])

        # Collapse runs of blank lines into a single blank line.
        lines = markdown_text.split('\n')
        cleaned_lines = []
        prev_empty = False
        for line in lines:
            line = line.rstrip()
            is_empty = len(line.strip()) == 0
            if is_empty and prev_empty:
                continue
            cleaned_lines.append(line)
            prev_empty = is_empty

        markdown_text = '\n'.join(cleaned_lines).strip()

        # Cap the output so a single page cannot blow up the dataset record.
        if len(markdown_text) > 50000:
            markdown_text = markdown_text[:50000] + "\n\n[Content truncated for size]"

        return markdown_text
    except Exception:
        return ""


async def main():
    """Main entry point for the Apify Actor."""
    async with Actor:
        actor_input = await Actor.get_input() or {}

        start_url = actor_input.get("startUrl", "")
        deep_crawl = actor_input.get("deepCrawl", True)
        download_media = actor_input.get("downloadMedia", True)
        media_limit = actor_input.get("mediaLimit", 100)

        if not start_url:
            Actor.log.error("No startUrl provided!")
            return

        Actor.log.info(f"🔍 Researching: {start_url}")

        # Plain requests session with a browser-like User-Agent.
        session = requests.Session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        })

        parsed = urlparse(start_url)
        domain = parsed.netloc
        base_url = f"{parsed.scheme}://{domain}"

        # Single result record pushed to the dataset at the end of the run.
        research = {
            "domain": domain,
            "crawled_at": datetime.now().isoformat(),
            "pages": [],
            "sitemap_urls": [],
            "tech_stack": {},
            "media": {"images": [], "videos": []}
        }

        # Step 1: discover URLs from the sitemap (store at most 500 of them).
        Actor.log.info("🗺️ Discovering sitemap...")
        all_sitemap_urls = get_sitemap_urls(session, base_url)
        research["sitemap_urls"] = all_sitemap_urls[:500]
        research["sitemap_total"] = len(all_sitemap_urls)
        Actor.log.info(f" Found {len(all_sitemap_urls)} URLs (storing {len(research['sitemap_urls'])})")

        # Step 2: detect the technology stack from the start page.
        Actor.log.info("🔧 Detecting tech stack...")
        research["tech_stack"] = detect_tech(session, start_url)

        # Step 3: crawl the start page itself.
        Actor.log.info("📥 Crawling main page...")
        main_page = crawl_page(session, start_url)
        if main_page:
            research["pages"].append(main_page)

        # Step 4: optionally crawl up to 50 additional pages from the sitemap.
        if deep_crawl and research["sitemap_urls"]:
            pages_to_crawl = [u for u in research["sitemap_urls"] if not u.endswith('.xml')][:50]
            Actor.log.info(f"🕷️ Deep crawling {len(pages_to_crawl)} pages...")

            for i, page_url in enumerate(pages_to_crawl, 1):
                if page_url != start_url:
                    Actor.log.info(f" [{i}/{len(pages_to_crawl)}] {page_url[:60]}...")
                    page_data = crawl_page(session, page_url)
                    if page_data:
                        research["pages"].append(page_data)

        # Step 5: collect unique absolute media URLs across all crawled pages.
        all_images = []
        all_videos = []
        seen_urls = set()

        for page in research["pages"]:
            for img in page.get("images", []):
                if img["src"] not in seen_urls and img["src"].startswith("http"):
                    seen_urls.add(img["src"])
                    all_images.append(img)
            for vid in page.get("videos", []):
                if vid["src"] not in seen_urls and vid["src"].startswith("http"):
                    seen_urls.add(vid["src"])
                    all_videos.append(vid)

        all_images = all_images[:media_limit]
        all_videos = all_videos[:media_limit]

        research["media"]["images"] = all_images
        research["media"]["videos"] = all_videos

        # Step 6: optionally download media into the key-value store. Files that
        # fail or exceed MAX_FILE_SIZE keep only their URL in the output.
        if download_media:
            Actor.log.info(f"📥 Processing {len(all_images)} images, {len(all_videos)} videos...")

            MAX_FILE_SIZE = 100 * 1024 * 1024  # 100 MB per file
            downloaded_count = 0

            for i, img in enumerate(all_images, 1):
                try:
                    filename = get_filename(img["src"]) or f"image_{i}.jpg"
                    img["filename"] = filename

                    content, size = download_file_streaming(session, img["src"], MAX_FILE_SIZE)
                    img["size_bytes"] = size

                    if content is not None:
                        # Stored under a fixed content type; the real format may differ.
                        await Actor.set_value(f"img_{filename}", content, content_type="image/jpeg")
                        img["downloaded"] = True
                        downloaded_count += 1
                        size_str = f"{size//1024}KB" if size < 1024*1024 else f"{size//1024//1024}MB"
                        Actor.log.info(f" 📷 [{i}] {filename[:40]} ({size_str})")
                    else:
                        img["downloaded"] = False
                        img["download_url"] = img["src"]
                        size_str = f"{size//1024//1024}MB" if size > 0 else "unknown"
                        Actor.log.info(f" 🔗 [{i}] {filename[:40]} - URL provided ({size_str})")
                except Exception as e:
                    img["downloaded"] = False
                    img["download_url"] = img["src"]
                    Actor.log.warning(f" ❌ [{i}] Image error: {type(e).__name__}: {str(e)[:80]}")

            for i, vid in enumerate(all_videos, 1):
                try:
                    filename = get_filename(vid["src"]) or f"video_{i}.mp4"
                    vid["filename"] = filename

                    content, size = download_file_streaming(session, vid["src"], MAX_FILE_SIZE)
                    vid["size_bytes"] = size

                    if content is not None:
                        # Stored under a fixed content type; the real format may differ.
                        await Actor.set_value(f"vid_{filename}", content, content_type="video/mp4")
                        vid["downloaded"] = True
                        downloaded_count += 1
                        size_str = f"{size//1024}KB" if size < 1024*1024 else f"{size//1024//1024}MB"
                        Actor.log.info(f" 🎬 [{i}] {filename[:40]} ({size_str})")
                    else:
                        vid["downloaded"] = False
                        vid["download_url"] = vid["src"]
                        size_str = f"{size//1024//1024}MB" if size > 0 else "unknown"
                        Actor.log.info(f" 🔗 [{i}] {filename[:40]} - URL provided ({size_str})")
                except Exception as e:
                    vid["downloaded"] = False
                    vid["download_url"] = vid["src"]
                    Actor.log.warning(f" ❌ [{i}] Video error: {type(e).__name__}: {str(e)[:80]}")

            Actor.log.info(f" ✅ Downloaded {downloaded_count} files, {len(all_images) + len(all_videos) - downloaded_count} URLs provided")

        # Step 7: trim per-page fields to keep the dataset record small.
        for page in research["pages"]:
            if len(page.get("headings", [])) > 50:
                page["headings"] = page["headings"][:50]
            if len(page.get("images", [])) > 20:
                page["images"] = page["images"][:20]
            if len(page.get("videos", [])) > 20:
                page["videos"] = page["videos"][:20]
            if len(page.get("description", "")) > 500:
                page["description"] = page["description"][:500] + "..."

        # Rough output size: length of the serialized JSON in bytes.
        estimated_size = len(json.dumps(research).encode("utf-8"))
        Actor.log.info(f"📊 Estimated output size: {estimated_size / 1024 / 1024:.2f} MB")

        # Step 8: push the single research record to the default dataset.
        await Actor.push_data(research)

        Actor.log.info(f"✅ Complete: {len(research['pages'])} pages, {len(all_images)} images, {len(all_videos)} videos")


def get_sitemap_urls(session: requests.Session, base_url: str) -> list:
    """Extract all page URLs from the site's sitemap, if one exists."""
    urls = []
    sitemap_locations = [
        f"{base_url}/sitemap.xml",
        f"{base_url}/sitemap_index.xml",
    ]

    for sitemap_url in sitemap_locations:
        try:
            resp = session.get(sitemap_url, timeout=30)
            if resp.status_code == 200:
                urls.extend(parse_sitemap(session, resp.text))
                break
        except Exception:
            continue

    return list(set(urls))


def parse_sitemap(session: requests.Session, xml_content: str) -> list:
    """Parse sitemap XML, recursing into nested sitemap index files."""
    urls = []
    loc_pattern = r'<loc>(.*?)</loc>'
    matches = re.findall(loc_pattern, xml_content, re.IGNORECASE)

    for match in matches:
        url = match.strip()
        if url.endswith('.xml'):
            # Nested sitemap: fetch it and parse recursively.
            try:
                resp = session.get(url, timeout=30)
                if resp.status_code == 200:
                    urls.extend(parse_sitemap(session, resp.text))
            except Exception:
                pass
        else:
            urls.append(url)

    return urls


def crawl_page(session: requests.Session, url: str) -> Optional[dict]:
    """Crawl a single page and extract data."""
    try:
        resp = session.get(url, timeout=30)
        if resp.status_code != 200:
            return None

        soup = BeautifulSoup(resp.text, 'html.parser')

        page = {
            "url": url,
            "title": "",
            "description": "",
            "markdown_content": "",
            "og_data": {},
            "json_ld": [],
            "headings": [],
            "images": [],
            "videos": []
        }

        # Page title
        title_tag = soup.find('title')
        page["title"] = title_tag.get_text(strip=True) if title_tag else ""

        # Meta description
        desc_tag = soup.find('meta', attrs={'name': 'description'})
        page["description"] = desc_tag.get('content', '') if desc_tag else ""

        # Open Graph tags
        for og_tag in soup.find_all('meta', property=re.compile(r'^og:')):
            prop = og_tag.get('property', '').replace('og:', '')
            page["og_data"][prop] = og_tag.get('content', '')

        # JSON-LD structured data
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                data = json.loads(script.string)
                page["json_ld"].append(data)
            except Exception:
                pass

        # Headings h1-h3
        for level in range(1, 4):
            for h in soup.find_all(f'h{level}'):
                text = h.get_text(strip=True)
                if text:
                    page["headings"].append({"level": level, "text": text[:100]})

        # Images, including lazy-loaded data-src attributes
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src') or ''
            if src:
                page["images"].append({
                    "src": urljoin(url, src),
                    "alt": img.get('alt', '')
                })

        # <video> elements and their <source> children
        for video in soup.find_all('video'):
            src = video.get('src') or ''
            if src:
                page["videos"].append({"src": urljoin(url, src), "type": "video"})
            for source in video.find_all('source'):
                src = source.get('src', '')
                if src:
                    page["videos"].append({"src": urljoin(url, src), "type": source.get('type', 'video')})

        # .mp4 URLs embedded in inline scripts
        for script in soup.find_all('script'):
            if script.string:
                for match in re.findall(r'https?://[^\s"]+\.mp4[^\s"]*', script.string):
                    page["videos"].append({"src": match.rstrip('"\',;'), "type": "embedded"})

        # RAG-friendly Markdown of the main content
        page["markdown_content"] = extract_markdown(soup, url)

        return page

    except Exception:
        return None


def detect_tech(session: requests.Session, url: str) -> dict:
    """Detect the technology stack from page HTML and response headers."""
    tech = {}
    try:
        resp = session.get(url, timeout=30)
        html = resp.text
        headers = dict(resp.headers)

        for tech_name, signatures in TECH_SIGNATURES.items():
            for sig in signatures:
                if sig.lower() in html.lower() or sig.lower() in str(headers).lower():
                    tech[tech_name] = True
                    break

        if 'Server' in headers:
            tech['server'] = headers['Server']
    except Exception:
        pass

    return tech


def get_filename(url: str) -> str:
    """Extract a sanitized filename from a URL path."""
    parsed = urlparse(url)
    filename = parsed.path.split('/')[-1] or ""

    # Replace anything other than word characters, dashes, and dots; cap at 80 chars.
    filename = re.sub(r'[^\w\-.]', '_', filename)[:80]
    return filename


def download_file_streaming(session: requests.Session, url: str, max_size: int = 100*1024*1024) -> Tuple[Optional[bytes], int]:
    """
    Stream a file download in chunks to avoid holding large files in memory all at once.
    Chunks accumulate in a temporary file, which is then read back as the complete content.

    Args:
        session: Requests session
        url: URL to download
        max_size: Maximum file size to download (default 100 MB)

    Returns:
        Tuple of (file_content or None, total_size).
        file_content is None if the file exceeds max_size or the download fails.
    """
    tmp_path = None
    try:
        with session.get(url, stream=True, timeout=120) as resp:
            if resp.status_code != 200:
                return None, 0

            # An HTML response usually means an error page, not the media file.
            content_type = resp.headers.get('content-type', '')
            if 'text/html' in content_type.lower():
                return None, 0

            total_size = 0
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp_path = tmp.name
                for chunk in resp.iter_content(chunk_size=1024*1024):
                    if chunk:
                        total_size += len(chunk)
                        if total_size > max_size:
                            return None, total_size
                        tmp.write(chunk)

        # Read the completed download back from the temp file.
        with open(tmp_path, 'rb') as f:
            content = f.read()

        return content, total_size

    except Exception:
        return None, 0
    finally:
        # Always clean up the temp file, even on early returns.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
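

# Minimal local-run sketch. This assumes the module is executed directly;
# packaged Apify projects typically start the Actor from a separate __main__.py.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())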