1"""Apify Actor for converting web pages to clean Markdown for LLMs and RAG.
2
3This Actor scrapes web pages and converts them into clean, token-efficient Markdown
4optimized for Large Language Models (LLMs) and Retrieval Augmented Generation (RAG) systems.
5
6To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
7https://docs.apify.com/sdk/python
8"""

from __future__ import annotations

import hashlib
import json
import re
from datetime import datetime
from typing import Any, Dict, List
from urllib.parse import urljoin, urlparse

from apify import Actor
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from markdownify import markdownify as md
from readability import Document


def estimate_tokens(text: str) -> int:
    """Estimate token count (rough approximation: 1 token ≈ 4 chars)."""
    return len(text) // 4
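# Illustrative only: the 4-characters-per-token heuristic is a rough average for
# English prose; a real tokenizer (e.g. tiktoken) will give different counts.
#     estimate_tokens("word " * 80)   # 400 chars -> 100 estimated tokens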


def create_chunks(markdown: str, max_chunk_size: int = 1000, overlap: int = 100) -> List[Dict[str, Any]]:
    """
    Split markdown content into semantic chunks while preserving context.

    Args:
        markdown: The markdown content to chunk
        max_chunk_size: Maximum characters per chunk
        overlap: Number of characters to overlap between chunks

    Returns:
        List of chunk dictionaries with metadata
    """
    chunks = []

    # Split on ATX headings (#, ##, ...) while keeping the heading lines as their own sections.
    sections = re.split(r'(\n#{1,6}\s+.+\n)', markdown)

    current_chunk = ""
    current_heading_context = []
    chunk_id = 1

    for section in sections:
        # Is this section one of the headings captured by the split above?
        heading_match = re.match(r'\n(#{1,6})\s+(.+)\n', section)

        if heading_match:
            level = len(heading_match.group(1))
            heading_text = heading_match.group(2).strip()

            # Trim the heading breadcrumb to the parent levels, then append the new heading.
            current_heading_context = current_heading_context[:level - 1]
            current_heading_context.append(heading_text)

            # Keep the heading in the current chunk if it still fits.
            if len(current_chunk) + len(section) < max_chunk_size:
                current_chunk += section
            else:
                # Flush the current chunk before starting a new one at this heading.
                if current_chunk.strip():
                    chunks.append({
                        'chunk_id': chunk_id,
                        'content': current_chunk.strip(),
                        'heading_context': ' > '.join(current_heading_context[:-1]) if len(current_heading_context) > 1 else '',
                        'char_count': len(current_chunk),
                        'estimated_tokens': estimate_tokens(current_chunk)
                    })
                    chunk_id += 1

                # Carry over the tail of the previous chunk so context survives the boundary.
                if overlap > 0 and current_chunk:
                    overlap_text = current_chunk[-overlap:]
                    current_chunk = overlap_text + section
                else:
                    current_chunk = section
        else:
            # Plain content between headings; skip empty fragments.
            if not section.strip():
                continue

            # Flush when adding this section would exceed the chunk size.
            if len(current_chunk) + len(section) > max_chunk_size:
                if current_chunk.strip():
                    chunks.append({
                        'chunk_id': chunk_id,
                        'content': current_chunk.strip(),
                        'heading_context': ' > '.join(current_heading_context),
                        'char_count': len(current_chunk),
                        'estimated_tokens': estimate_tokens(current_chunk)
                    })
                    chunk_id += 1

                if overlap > 0 and current_chunk:
                    overlap_text = current_chunk[-overlap:]
                    current_chunk = overlap_text + section
                else:
                    current_chunk = section
            else:
                current_chunk += section

    # Flush whatever is left after the last section.
    if current_chunk.strip():
        chunks.append({
            'chunk_id': chunk_id,
            'content': current_chunk.strip(),
            'heading_context': ' > '.join(current_heading_context),
            'char_count': len(current_chunk),
            'estimated_tokens': estimate_tokens(current_chunk)
        })

    return chunks
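# Sketch of the returned shape (values are illustrative, not from a real run):
#     [{'chunk_id': 1,
#       'content': '## Install\nRun the installer ...',
#       'heading_context': '',
#       'char_count': 48,
#       'estimated_tokens': 12}, ...]
# heading_context carries the parent-heading breadcrumb (e.g. 'Guide > Install'),
# so each chunk can be embedded for RAG without losing its place in the document.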


def extract_metadata(soup, url: str) -> Dict[str, Any]:
    """Extract metadata from the page."""
    metadata = {
        'url': url,
        'domain': urlparse(url).netloc,
        'scraped_at': datetime.utcnow().isoformat() + 'Z',
    }

    # Author: prefer a <meta name="author"> tag, then fall back to JSON-LD.
    author = None
    author_meta = soup.find('meta', {'name': re.compile(r'author', re.I)})
    if author_meta:
        author = author_meta.get('content')
    if not author:
        json_ld = soup.find('script', {'type': 'application/ld+json'})
        if json_ld and json_ld.string:
            try:
                data = json.loads(json_ld.string)
                if isinstance(data, dict):
                    author = data.get('author', {}).get('name')
            except (json.JSONDecodeError, AttributeError, TypeError):
                author = None
    metadata['author'] = author

    # Publish date from Open Graph / article meta tags.
    publish_date = None
    date_meta = soup.find('meta', {'property': 'article:published_time'}) or \
        soup.find('meta', {'name': re.compile(r'publish|date', re.I)})
    if date_meta:
        publish_date = date_meta.get('content')
    metadata['publish_date'] = publish_date

    # Last-modified date, if the page exposes one.
    modified_date = None
    modified_meta = soup.find('meta', {'property': 'article:modified_time'}) or \
        soup.find('meta', {'name': 'last-modified'})
    if modified_meta:
        modified_date = modified_meta.get('content')
    metadata['last_modified'] = modified_date

    # Language from the <html lang="..."> attribute, defaulting to English.
    lang = soup.find('html').get('lang', 'en') if soup.find('html') else 'en'
    metadata['language'] = lang[:2]

    # Keywords, capped at ten entries.
    keywords = []
    keywords_meta = soup.find('meta', {'name': re.compile(r'keywords', re.I)})
    if keywords_meta:
        keywords_content = keywords_meta.get('content', '')
        keywords = [k.strip() for k in keywords_content.split(',') if k.strip()]
    metadata['keywords'] = keywords[:10]

    # Description from the standard or Open Graph meta tag.
    desc_meta = soup.find('meta', {'name': 'description'}) or \
        soup.find('meta', {'property': 'og:description'})
    metadata['description'] = desc_meta.get('content') if desc_meta else None

    # Rough content-type classification based on common URL path segments.
    content_type = 'general'
    if '/blog/' in url or '/article/' in url or '/post/' in url:
        content_type = 'blog'
    elif '/docs/' in url or '/documentation/' in url:
        content_type = 'documentation'
    elif '/product/' in url or '/shop/' in url:
        content_type = 'product'
    elif '/wiki/' in url:
        content_type = 'wiki'
    metadata['content_type'] = content_type

    return metadata
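# Example (illustrative URL): for 'https://example.com/blog/why-rag', content_type
# is classified as 'blog' purely from the URL path; the page body is not inspected.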


def resolve_relative_links(markdown: str, base_url: str) -> str:
    """Convert relative URLs to absolute URLs in markdown links."""

    def replace_link(match):
        text = match.group(1)
        url = match.group(2)

        # Leave absolute URLs, fragments, mailto: and tel: links untouched.
        if url.startswith(('http://', 'https://', '#', 'mailto:', 'tel:')):
            return match.group(0)

        # Resolve everything else against the page URL.
        absolute_url = urljoin(base_url, url)
        return f'[{text}]({absolute_url})'

    # Rewrite every [text](url) style markdown link.
    markdown = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_link, markdown)

    return markdown
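# Example (illustrative values):
#     resolve_relative_links('[docs](/docs/intro)', 'https://example.com/guide')
#     # -> '[docs](https://example.com/docs/intro)'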


def extract_code_blocks(markdown: str) -> Dict[str, Any]:
    """Extract fenced and inline code information from markdown."""
    code_blocks = []

    # Fenced blocks delimited by triple backticks, with an optional language tag.
    pattern = r'```(\w+)?\n(.*?)```'
    matches = re.findall(pattern, markdown, re.DOTALL)

    for lang, code in matches:
        code_blocks.append({
            'language': lang if lang else 'text',
            'code': code.strip(),
            'lines': len(code.strip().split('\n'))
        })

    # Inline code spans; fenced blocks are removed first so their backticks
    # are not double-counted as inline code.
    inline_pattern = r'`([^`]+)`'
    text_without_fences = re.sub(pattern, '', markdown, flags=re.DOTALL)
    inline_matches = re.findall(inline_pattern, text_without_fences)

    return {
        'fenced_blocks': code_blocks,
        'inline_code_count': len(inline_matches),
        'has_code': len(code_blocks) > 0 or len(inline_matches) > 0
    }
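# Sketch of the returned shape for a page with one fenced python block and two
# inline code spans (values illustrative):
#     {'fenced_blocks': [{'language': 'python', 'code': "print('hi')", 'lines': 1}],
#      'inline_code_count': 2,
#      'has_code': True}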


def calculate_quality_metrics(markdown: str, html_length: int) -> Dict[str, Any]:
    """Calculate content quality metrics."""
    # How much of the original HTML survived as text.
    text_density = len(markdown) / max(html_length, 1)

    # Count substantial paragraphs (plain lines longer than 50 characters).
    lines = [line.strip() for line in markdown.split('\n') if line.strip()]
    paragraphs = [line for line in lines if not line.startswith(('#', '-', '*', '>'))]
    paragraph_count = len([p for p in paragraphs if len(p) > 50])

    # Average sentence length in words.
    sentences = re.split(r'[.!?]+', markdown)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
    avg_sentence_length = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)

    # Reading time at roughly 200 words per minute.
    word_count = len(markdown.split())
    reading_time_minutes = max(1, round(word_count / 200))

    # Structural signals.
    has_lists = bool(re.search(r'^\s*[-*+]\s', markdown, re.MULTILINE))
    has_headings = bool(re.search(r'^#{1,6}\s', markdown, re.MULTILINE))
    has_links = bool(re.search(r'\[.+\]\(.+\)', markdown))

    return {
        'text_density': round(text_density, 2),
        'paragraph_count': paragraph_count,
        'word_count': word_count,
        'sentence_count': len(sentences),
        'avg_sentence_length': round(avg_sentence_length, 1),
        'reading_time_minutes': reading_time_minutes,
        'has_lists': has_lists,
        'has_headings': has_headings,
        'has_links': has_links,
        'structure_score': sum([has_lists, has_headings, has_links]) / 3.0
    }
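# Note: structure_score is the fraction of the three structural signals present,
# so a page with headings and links but no lists scores 2/3 ≈ 0.67.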


def generate_content_hashes(markdown: str) -> Dict[str, str]:
    """Generate hashes for deduplication."""
    # Exact-match hash over the raw markdown.
    content_hash = hashlib.sha256(markdown.encode('utf-8')).hexdigest()

    # Near-duplicate hash: lowercase and collapse whitespace before hashing.
    normalized = re.sub(r'\s+', ' ', markdown.lower().strip())
    similarity_hash = hashlib.sha256(normalized.encode('utf-8')).hexdigest()[:16]

    return {
        'content_hash': content_hash,
        'similarity_hash': similarity_hash
    }
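# Assumed downstream usage (not part of this Actor): similarity_hash can serve as a
# cheap near-duplicate key, since pages that differ only in whitespace or casing share it.
#     seen: set[str] = set()
#     if hashes['similarity_hash'] in seen:
#         ...  # skip near-duplicate page
#     seen.add(hashes['similarity_hash'])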


async def main() -> None:
    """Define the main entry point for the Apify Actor.

    This coroutine is executed with `asyncio.run()`, so it must remain an asynchronous
    function; asynchronous execution is required for communication with the Apify platform.
    """
    async with Actor:
        # Read the Actor input (or fall back to an empty dict).
        actor_input = await Actor.get_input() or {}

        start_urls = [
            url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        ]
        include_links = actor_input.get('include_links', True)
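        # Illustrative Actor input matching the fields read above (values assumed):
        #     {
        #         "start_urls": [{"url": "https://example.com/blog/post"}],
        #         "include_links": true
        #     }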

        # Exit early when no URLs were provided.
        if not start_urls:
            Actor.log.info('No URLs provided in start_urls, exiting...')
            await Actor.exit()
            return

        Actor.log.info(f'Processing {len(start_urls)} URLs')
        Actor.log.info(f'Include links: {include_links}')

        # Cap the crawl at the start URLs; the handler below never enqueues further links.
        crawler = BeautifulSoupCrawler(
            max_requests_per_crawl=len(start_urls),
        )

        # The default handler converts each page to Markdown and pushes one dataset item.
        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            url = context.request.url
            Actor.log.info(f'Scraping {url}...')
            Actor.log.info(f'Starting content extraction for {url}')

            try:
                soup = context.soup
                Actor.log.info(f'BeautifulSoup parsed, HTML length: {len(str(soup))} chars')

                # Remember the raw HTML size for the text-density metric.
                original_html_length = len(str(soup))

                title = soup.title.string.strip() if soup.title and soup.title.string else 'No Title'

                # Extract metadata before noisy elements are removed.
                metadata = extract_metadata(soup, url)

                # Drop structural elements that never carry main content.
                for element in soup(['script', 'style', 'nav', 'header', 'footer',
                                     'iframe', 'noscript', 'svg', 'button', 'form']):
                    element.decompose()

                # Remove common cookie banners, pop-ups and ads by class/id/aria-label.
                noise_selectors = [
                    '[class*="cookie"]', '[class*="banner"]', '[class*="popup"]',
                    '[class*="modal"]', '[id*="cookie"]', '[class*="ad-"]',
                    '[class*="advertisement"]', '[aria-label*="cookie"]'
                ]
                for selector in noise_selectors:
                    for element in soup.select(selector):
                        element.decompose()

                # Prefer readability's article extraction; fall back to the full body below.
                use_readability = True
                try:
                    html_content = str(soup)
                    doc = Document(html_content)
                    clean_html = doc.summary()

                    markdown_content = md(
                        clean_html,
                        heading_style='ATX',
                        bullets='-',
                        # markdownify accepts either `strip` or `convert`, not both,
                        # so anchors are stripped when links are not wanted.
                        strip=['script', 'style', 'img'] + ([] if include_links else ['a']),
                        escape_asterisks=False,
                        escape_underscores=False
                    )

                    # Readability sometimes returns only a fragment; treat that as a failure.
                    if len(markdown_content.strip()) < 300:
                        raise ValueError("Content too short, using full body")

                except Exception:
                    # Fall back to converting the whole <main>/<article>/<body>.
                    use_readability = False
                    Actor.log.info(f'Using full body content for {url}')

                    main_content = soup.find('main') or soup.find('article') or soup.body or soup

                    markdown_content = md(
                        str(main_content),
                        heading_style='ATX',
                        bullets='-',
                        strip=['script', 'style', 'img'] + ([] if include_links else ['a']),
                        escape_asterisks=False,
                        escape_underscores=False
                    )

                # Normalize the markdown line by line.
                lines = []
                prev_empty = False

                for line in markdown_content.split('\n'):
                    line = line.strip()

                    # Drop very short non-heading fragments, but keep blank lines
                    # so paragraph breaks survive the cleanup.
                    if line and len(line) < 3 and not line.startswith('#'):
                        continue

                    # Drop lines that are only punctuation/decoration.
                    if line and all(c in '.-_*[](){}|\\/' for c in line):
                        continue

                    # Collapse runs of blank lines into a single one.
                    if not line:
                        if not prev_empty:
                            lines.append('')
                        prev_empty = True
                    else:
                        lines.append(line)
                        prev_empty = False

                markdown_content = '\n'.join(lines).strip()

                # Make relative links absolute so they stay usable out of context.
                markdown_content = resolve_relative_links(markdown_content, url)

                # Collect code, quality and deduplication signals.
                code_info = extract_code_blocks(markdown_content)

                quality_metrics = calculate_quality_metrics(markdown_content, original_html_length)

                hashes = generate_content_hashes(markdown_content)

                # Split into RAG-friendly chunks.
                chunks = create_chunks(markdown_content, max_chunk_size=1000, overlap=100)

                # Quality gate: skip pages with too little extracted content.
                MIN_CONTENT_LENGTH = 100

                Actor.log.info(f'Extracted markdown length: {len(markdown_content)} chars')

                if len(markdown_content.strip()) < MIN_CONTENT_LENGTH:
                    Actor.log.warning(
                        f'⚠️ Skipping {url}: Content too short ({len(markdown_content)} chars). '
                        f'Minimum required: {MIN_CONTENT_LENGTH} chars. User not charged.'
                    )
                    return

                # Quality gate: skip short pages that look like error or JavaScript-wall pages.
                error_indicators = [
                    'enable javascript',
                    'javascript is disabled',
                    'please enable cookies',
                    'access denied',
                    '403 forbidden',
                    '404 not found',
                    'page not found'
                ]
                content_lower = markdown_content.lower()
                if any(indicator in content_lower for indicator in error_indicators):
                    if len(markdown_content) < 500:
                        Actor.log.warning(
                            f'⚠️ Skipping {url}: Appears to be an error page or requires JavaScript. User not charged.'
                        )
                        return

                Actor.log.info(f'✅ Content passed quality checks for {url}')

                # Prepend the source URL so the markdown is self-describing.
                final_content = f"**Source:** {url}\n\n---\n\n{markdown_content}"

                # Assemble the dataset item.
                data = {
                    'url': url,
                    'title': title,
                    'markdown_content': final_content,
                    'chunks': chunks,
                    'metadata': metadata,
                    'code_blocks': code_info,
                    'quality_metrics': quality_metrics,
                    'hashes': hashes,
                    'total_chunks': len(chunks),
                    'total_chars': len(markdown_content),
                    'estimated_tokens': estimate_tokens(markdown_content)
                }

                method = "Readability" if use_readability else "Full Body"
                Actor.log.info(f'Successfully converted [{method}]: {title} ({len(final_content)} chars, {len(chunks)} chunks)')

                # Store the result in the default dataset.
                await context.push_data(data)

            except Exception:
                Actor.log.exception(f'Error processing {url}')

        await crawler.run(start_urls)


if __name__ == '__main__':
    import asyncio

    asyncio.run(main())