1import { Actor } from "apify";
2import { PlaywrightCrawler, Dataset } from "crawlee";
3
/**
 * Actor input as supplied by the user.
 *
 * Only `startUrls` is mandatory. Every other field is optional because the
 * script substitutes a default (via `??`) when it is missing.
 */
interface Input {
  /** Pages the crawl starts from; at least one entry is required. */
  startUrls: Array<{ url: string }>;
  /** Upper bound on the number of pages to process (defaults to 10). */
  maxPages?: number;
  /** How many link hops away from a start URL may be followed (defaults to 2). */
  maxDepth?: number;
  /** Format of the extracted `content` field (defaults to "markdown"). */
  outputFormat?: "markdown" | "text" | "html";
  /** Strip navigation/boilerplate elements before extraction (defaults to true). */
  removeNavigation?: boolean;
  /** Read meta/Open Graph tags into the result (defaults to true). */
  includeMetadata?: boolean;
}
12
/** Values read from a page's `<meta>` tags and `<html lang>`; `null` when absent. */
interface PageMetadata {
  description: string | null;
  ogTitle: string | null;
  ogDescription: string | null;
  ogImage: string | null;
  language: string | null;
}

/** One dataset record describing a single crawled page. */
interface PageResult {
  /** URL the page was loaded from. */
  url: string;
  /** Document title. */
  title: string;
  /** Extracted page content in the requested output format. */
  content: string;
  /** Meta-tag information (all fields `null` when metadata is not collected). */
  metadata: PageMetadata;
  /** Absolute http(s) links found on the page. */
  links: string[];
  /** Number of whitespace-separated tokens in `content`. */
  wordCount: number;
  /** ISO-8601 timestamp of when the record was created. */
  crawledAt: string;
}
28
await Actor.init();

// The input may be absent altogether (e.g. a run started without any input),
// so treat it as nullable instead of asserting it straight to `Input`;
// otherwise a missing input surfaces as an opaque TypeError on `.startUrls`.
const rawInput = (await Actor.getInput()) as Input | null;

if (!rawInput || !Array.isArray(rawInput.startUrls) || rawInput.startUrls.length === 0) {
  throw new Error("At least one start URL is required");
}

const input: Input = rawInput;

// Optional settings fall back to defaults when the user did not provide them.
const maxPages = input.maxPages ?? 10;
const maxDepth = input.maxDepth ?? 2;
const outputFormat = input.outputFormat ?? "markdown";
const removeNav = input.removeNavigation ?? true;
const includeMetadata = input.includeMetadata ?? true;

// Number of pages successfully pushed to the dataset so far.
let pagesProcessed = 0;
44
/** Selector list locating the page's primary content container (first match wins). */
const MAIN_CONTENT_SELECTOR = 'main, article, [role="main"], .content, #content';

/** Elements removed from the DOM before extraction when navigation stripping is enabled. */
const BOILERPLATE_SELECTORS: string[] = [
  'nav', 'header', 'footer', '.nav', '.navbar', '.menu',
  '.sidebar', '.cookie-banner', '.popup', '.modal',
  '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
  '.advertisement', '.ad', '#cookie-consent', '.social-share',
];

/**
 * Crawler that renders each page in a browser, extracts its content in the
 * configured output format, stores one `PageResult` per page in the default
 * dataset and follows links up to `maxDepth` hops from the start URLs.
 *
 * Functions handed to `page.evaluate` run inside the browser page, so they
 * must be self-contained; values they need are passed in as arguments.
 */
const crawler = new PlaywrightCrawler({
  maxRequestsPerCrawl: maxPages,

  async requestHandler({ request, page, enqueueLinks, log }) {
    // Extra guard on top of maxRequestsPerCrawl: never store more than maxPages results.
    if (pagesProcessed >= maxPages) return;

    const url = request.loadedUrl || request.url;
    log.info(`Processing: ${url}`);

    const title = await page.title();

    // Strip navigation, banners, ads etc. so they do not end up in the content.
    if (removeNav) {
      await page.evaluate((selectors) => {
        for (const sel of selectors) {
          document.querySelectorAll(sel).forEach((el) => el.remove());
        }
      }, BOILERPLATE_SELECTORS);
    }

    // Extract the content; only the extraction needed for the chosen format is run.
    let content: string;
    if (outputFormat === "html") {
      content = await page.evaluate((selector) => {
        const main = document.querySelector(selector);
        return (main || document.body).innerHTML;
      }, MAIN_CONTENT_SELECTOR);
    } else if (outputFormat === "markdown") {
      content = await page.evaluate((selector) => {
        const main = document.querySelector(selector) || document.body;
        const lines: string[] = [];
        const headingTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'];

        const textOf = (el: Element): string => (el.textContent ?? '').trim();

        const processNode = (node: Node): void => {
          if (node.nodeType === Node.TEXT_NODE) {
            const text = (node.textContent || '').trim();
            if (text) lines.push(text);
            return;
          }

          if (node.nodeType !== Node.ELEMENT_NODE) return;
          const el = node as Element;
          const tag = el.tagName.toLowerCase();

          if (['script', 'style', 'noscript'].includes(tag)) return;

          // Leaf-like elements are rendered from their flattened text and not
          // descended into; empty ones are skipped so no dangling markers are emitted.
          const headingLevel = headingTags.indexOf(tag) + 1;
          if (headingLevel > 0) {
            const text = textOf(el);
            if (text) lines.push(`\n${'#'.repeat(headingLevel)} ${text}\n`);
            return;
          }
          if (tag === 'p') {
            const text = textOf(el);
            if (text) lines.push(`\n${text}\n`);
            return;
          }
          if (tag === 'li') {
            const text = textOf(el);
            if (text) lines.push(`- ${text}`);
            return;
          }
          if (tag === 'a') {
            const href = el.getAttribute('href');
            const text = textOf(el);
            if (href && text) {
              lines.push(`[${text}](${href})`);
              return;
            }
            // Anchors without href or text fall through and are treated as containers.
          }
          if (tag === 'pre' || tag === 'code') {
            const text = textOf(el);
            if (text) lines.push(`\n\`\`\`\n${text}\n\`\`\`\n`);
            return;
          }
          if (tag === 'br') { lines.push(''); return; }
          if (tag === 'hr') { lines.push('\n---\n'); return; }

          // Generic container: recurse into children.
          for (const child of Array.from(el.childNodes)) {
            processNode(child);
          }
        };

        processNode(main);
        // Collapse runs of blank lines left over from block-level spacing.
        return lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
      }, MAIN_CONTENT_SELECTOR);
    } else {
      const rawText = await page.evaluate((selector) => {
        const main = document.querySelector(selector);
        return ((main || document.body) as HTMLElement).innerText;
      }, MAIN_CONTENT_SELECTOR);
      content = rawText.replace(/\n{3,}/g, '\n\n').trim();
    }

    // Metadata stays all-null unless collection is enabled.
    let metadata: PageResult["metadata"] = {
      description: null, ogTitle: null, ogDescription: null, ogImage: null, language: null,
    };

    if (includeMetadata) {
      metadata = await page.evaluate(() => ({
        description: document.querySelector('meta[name="description"]')?.getAttribute('content') || null,
        ogTitle: document.querySelector('meta[property="og:title"]')?.getAttribute('content') || null,
        ogDescription: document.querySelector('meta[property="og:description"]')?.getAttribute('content') || null,
        ogImage: document.querySelector('meta[property="og:image"]')?.getAttribute('content') || null,
        language: document.documentElement.lang || null,
      }));
    }

    // Absolute http(s) links on the page, capped at 100 entries per record.
    const links = await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href]'))
        .map((a) => (a as HTMLAnchorElement).href)
        .filter((href) => href.startsWith('http'))
        .slice(0, 100)
    );

    const wordCount = content.split(/\s+/).filter((w) => w.length > 0).length;

    const result: PageResult = {
      url,
      title,
      content,
      metadata,
      links,
      wordCount,
      crawledAt: new Date().toISOString(),
    };

    await Dataset.pushData(result);
    pagesProcessed++;

    // Follow links only while below the configured depth; start requests carry depth 0.
    const storedDepth: unknown = request.userData?.depth;
    const currentDepth = typeof storedDepth === "number" ? storedDepth : 0;
    if (currentDepth < maxDepth) {
      // NOTE(review): depth is attached both via `userData` and the transform
      // function, as in the original; one of the two is likely redundant.
      await enqueueLinks({
        userData: { depth: currentDepth + 1 },
        transformRequestFunction: (req) => {
          req.userData = { ...req.userData, depth: currentDepth + 1 };
          return req;
        },
      });
    }

    // Best-effort pay-per-event charge. `charge` takes an options object; the
    // optional call keeps this a no-op on SDK versions that do not provide it.
    try {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- `charge` may be missing from the installed SDK typings
      await (Actor as any).charge?.({ eventName: "page-crawled", count: 1 });
    } catch (err: unknown) {
      // Charging must never fail the page, but the failure should be visible.
      const reason = err instanceof Error ? err.message : String(err);
      log.warning(`Charging for ${url} failed: ${reason}`);
    }

    log.info(`Done: ${url} (${wordCount} words)`);
  },

  failedRequestHandler({ request, log }, error) {
    log.error(`Failed: ${request.url}`, { error: error.message });
  },
});
202
203
// Seed the crawl: every user-supplied start URL enters the queue at depth 0.
const startRequests = input.startUrls.map(({ url }) => {
  const userData = { depth: 0 };
  return { url, userData };
});

// Run to completion, report how many pages were stored, then shut the Actor down.
await crawler.run(startRequests);

console.log(`Crawling complete. Pages processed: ${pagesProcessed}`);

await Actor.exit();