1import { Actor } from 'apify';
2import { CheerioCrawler, log } from 'crawlee';
3import { Readability } from '@mozilla/readability';
4import { JSDOM } from 'jsdom';
5
/** One crawl entry point: an object carrying the page URL. */
interface StartUrlEntry {
    url: string;
}

/** Shape of the Actor input consumed by this script. */
interface InputSchema {
    /** Pages to crawl; the script rejects input where this list is missing or empty. */
    startUrls: StartUrlEntry[];
    /**
     * Proxy settings forwarded unchanged to `Actor.createProxyConfiguration`.
     * NOTE(review): left as `any` so it stays assignable to that call without
     * importing the SDK's options type — consider tightening it.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    proxyConfiguration?: any;
}
10
/**
 * Guards the Actor input: throws when the input is absent, has no `startUrls`,
 * or has an empty `startUrls` list. After it returns, the compiler treats the
 * argument as a fully present `InputSchema`.
 *
 * @param candidate - Value returned by `Actor.getInput`.
 * @throws Error when there is nothing to crawl.
 */
function requireStartUrls(candidate: InputSchema | null): asserts candidate is InputSchema {
    const nothingToCrawl = !candidate || !candidate.startUrls || candidate.startUrls.length === 0;
    if (nothingToCrawl) {
        throw new Error('Girdi bulunamadı veya startUrls boş!');
    }
}

// The SDK must be initialised before the input is read.
await Actor.init();

const input = await Actor.getInput<InputSchema>();
requireStartUrls(input);

// Proxy settings from the input are handed to the SDK unchanged.
const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);
19
/** Words-per-minute divisor used to turn a word count into a reading-time estimate. */
const WORDS_PER_MINUTE = 200;

/**
 * Collapses each run of blank / whitespace-only lines into a single empty line
 * and strips leading and trailing whitespace.
 *
 * @param rawText - Text content as returned by Readability.
 * @returns The normalised text; may be an empty string.
 */
function normalizeArticleText(rawText: string): string {
    return rawText.replace(/\n\s*\n/g, '\n\n').trim();
}

/**
 * Counts whitespace-separated words.
 *
 * @param text - Any text.
 * @returns `0` for empty or whitespace-only text (a bare `split` would report 1).
 */
function countWords(text: string): number {
    const trimmed = text.trim();
    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
}

/**
 * Parses a page with jsdom and runs Readability over it.
 *
 * The jsdom window is always closed afterwards so the DOM of each crawled page
 * can be released instead of lingering for the lifetime of the crawl. The parse
 * result is read before the window is closed.
 *
 * @param html - Response body of the page.
 * @param pageUrl - URL given to jsdom as the document URL.
 * @returns Whatever `Readability.parse()` returns (`null` when no article is found).
 */
function extractArticle(html: ConstructorParameters<typeof JSDOM>[0], pageUrl: string) {
    const dom = new JSDOM(html, { url: pageUrl });
    try {
        return new Readability(dom.window.document).parse();
    } finally {
        dom.window.close();
    }
}

/**
 * Crawler that downloads each start URL, extracts the main article with
 * Readability and pushes one record per article to the default dataset.
 */
const crawler = new CheerioCrawler({
    proxyConfiguration,
    maxRequestRetries: 3,
    requestHandlerTimeoutSecs: 30,

    preNavigationHooks: [
        // Send a fixed set of browser-like headers on top of whatever is already set.
        (_crawlingContext, requestOptions) => {
            requestOptions.headers = {
                ...requestOptions.headers,
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:120.0) Gecko/20100101 Firefox/120.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Upgrade-Insecure-Requests': '1',
            };
        },
    ],

    async requestHandler({ request, body, log }) {
        log.info(`Makale işleniyor: ${request.url}`);

        // Parsing is best-effort: a failure here is logged and the page is skipped.
        let article: ReturnType<typeof extractArticle>;
        try {
            article = extractArticle(body, request.url);
        } catch (error: unknown) {
            const reason = error instanceof Error ? error.message : String(error);
            log.error(`Hata oluştu (${request.url}): ${reason}`);
            return;
        }

        // Whitespace-only content counts as "no article" rather than producing
        // an empty record with a bogus one-minute reading time.
        const cleanText = article?.textContent ? normalizeArticleText(article.textContent) : '';
        if (!article || cleanText === '') {
            log.warning(`⚠️ Bu sayfadan makale çıkarılamadı: ${request.url}`);
            return;
        }

        const readingTimeMins = Math.ceil(countWords(cleanText) / WORDS_PER_MINUTE);

        // Deliberately not wrapped in try/catch: if storing the record fails,
        // the error must reach the crawler so the request is retried and, if
        // retries run out, reported by failedRequestHandler instead of being
        // silently marked as done.
        await Actor.pushData({
            url: request.url,
            // `||` (not `??`) on purpose: empty strings are normalised to null too.
            title: article.title || null,
            author: article.byline || null,
            publishedTime: article.publishedTime || null,
            siteName: article.siteName || null,
            textContent: cleanText,
            readingTimeMins,
            scrapedAt: new Date().toISOString(),
        });
        log.info(`✅ Makale başarıyla çıkarıldı: ${article.title}`);
    },

    // Called by the crawler once a request has exhausted its retries.
    failedRequestHandler({ request, log }) {
        log.error(`İstek başarısız oldu: ${request.url}`);
    },
});
80
// Only the plain URL of each start entry is handed to the crawler.
const startUrlList = input.startUrls.map(({ url }) => url);
await crawler.run(startUrlList);

// Log completion, then shut the Actor down.
log.info('Blog Tarama işlemi başarıyla tamamlandı!');
await Actor.exit();