1import path from 'node:path';
2
3import { Actor, log } from 'apify';
4import * as cheerio from 'cheerio';
5import got from 'got';
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29export function createCrawler({ config, requestQueue, processedUrls, robotsChecker }) {
30 const {
31 userAgent, httpAgent, httpsAgent, maxCrawlDepth,
32 imageExtensionsSet, useScope, scope, includeSubdomains,
33 } = config;
34
35
36
/**
 * Decides whether a URL falls inside the configured crawl scope.
 *
 * When `useScope` is off, every URL is accepted. Otherwise the URL's hostname
 * must equal one of the `scope` entries or, when `includeSubdomains` is set,
 * end with `.<entry>`.
 *
 * Both sides are compared lower-cased and scope entries are trimmed: the URL
 * parser lower-cases http(s) hostnames, so an entry written as "Example.com"
 * could otherwise never match.
 *
 * @param {string} targetUrl - Absolute URL to test.
 * @returns {boolean} `true` when the URL may be crawled; `false` when it is
 *     out of scope, cannot be parsed, or the scope list cannot be iterated.
 */
function isUrlAllowedByScope(targetUrl) {
    if (!useScope) return true;
    try {
        const hostname = new URL(targetUrl).hostname.toLowerCase();
        return scope.some((entry) => {
            const domain = String(entry).trim().toLowerCase();
            // An empty entry would turn the subdomain test into `endsWith('.')`.
            if (domain === '') return false;
            if (hostname === domain) return true;
            return Boolean(includeSubdomains) && hostname.endsWith(`.${domain}`);
        });
    } catch {
        // Unparseable URL (or unusable scope list): treat as out of scope.
        return false;
    }
}
50
/**
 * Collects the image URLs a page embeds.
 *
 * Two sources are scanned: the `src` of every `<img>` element, and the first
 * `background-image: url(...)` declaration found in each inline `style`
 * attribute. Each candidate is resolved against `baseUrl` and kept only when
 * the extension of its URL *path* is listed in `imageExtensionsSet`.
 *
 * @param {Function} $ - Query function for the parsed document; called as
 *     `$(selector).each(cb)` and `$(element).attr(name)`.
 * @param {string} baseUrl - URL used to resolve relative references.
 * @returns {string[]} Unique absolute image URLs, in order of first appearance.
 */
function extractImagesFromPage($, baseUrl) {
    const candidates = [];

    $('img[src]').each((_, el) => {
        const src = $(el).attr('src');
        if (src) candidates.push(src);
    });

    $('[style]').each((_, el) => {
        const style = $(el).attr('style') || '';
        const match = style.match(/background-image:\s*url\(["']?([^"')]+)["']?\)/i);
        if (match) candidates.push(match[1]);
    });

    const unique = new Set();
    for (const candidate of candidates) {
        let resolved;
        try {
            resolved = new URL(candidate, baseUrl);
        } catch {
            // Best effort: a reference that cannot be resolved is not an image we can report.
            continue;
        }
        // Take the extension from the path only. Using the whole href would make
        // "photo.jpg?w=100" yield the extension "jpg?w=100" and the image would be missed.
        // `path.posix` keeps "/" as the separator on every platform.
        const ext = path.posix.extname(resolved.pathname).slice(1).toLowerCase();
        if (imageExtensionsSet.has(ext)) {
            unique.add(resolved.href);
        }
    }
    return [...unique];
}
78
/**
 * Collects the targets of every `<a href>` on a page.
 *
 * Each href is resolved against `baseUrl`; hrefs that cannot be resolved are
 * skipped. A link whose URL *path* ends in an extension listed in
 * `imageExtensionsSet` is additionally reported as a linked image (it stays in
 * `links` as well).
 *
 * @param {Function} $ - Query function for the parsed document; called as
 *     `$(selector).each(cb)` and `$(element).attr(name)`.
 * @param {string} baseUrl - URL used to resolve relative hrefs.
 * @returns {{ links: string[], linkedImages: string[] }} Unique absolute URLs,
 *     each list in order of first appearance.
 */
function extractLinksFromPage($, baseUrl) {
    const links = new Set();
    const linkedImages = new Set();

    $('a[href]').each((_, el) => {
        const href = $(el).attr('href');
        if (!href) return;

        let resolved;
        try {
            resolved = new URL(href, baseUrl);
        } catch {
            // Best effort: ignore hrefs that do not form a valid URL.
            return;
        }

        links.add(resolved.href);

        // Take the extension from the path only, so a query string or fragment
        // ("pic.jpg?size=large") does not hide the image extension.
        const ext = path.posix.extname(resolved.pathname).slice(1).toLowerCase();
        if (imageExtensionsSet.has(ext)) {
            // A Set keeps repeated anchors to the same image from producing duplicates.
            linkedImages.add(resolved.href);
        }
    });

    return { links: [...links], linkedImages: [...linkedImages] };
}
98
99
100
/**
 * Processes one queued request: downloads the page, records the images it
 * references and, while below `maxCrawlDepth`, enqueues the links it contains.
 *
 * The request is skipped (with a log line, never an exception) when the URL is
 * not http(s), is malformed, is disallowed by the robots guard, was already
 * processed, fails to download, or is not served as HTML.
 *
 * @param {{ url: string, userData?: { depth?: number } }} request - Queue
 *     entry. A missing `userData.depth` is treated as depth 0.
 * @returns {Promise<void>} Resolves once all records are pushed and all
 *     eligible links are enqueued.
 */
async function crawlPage(request) {
    const { url } = request;
    // Without this default a request lacking `userData.depth` would either throw
    // on destructuring or compare `undefined < maxCrawlDepth` and never follow links.
    const depth = request.userData?.depth ?? 0;

    if (!/^https?:/.test(url)) {
        log.info(`Skipping non-HTTP URL: ${url}`);
        return;
    }

    let parsedUrl;
    try {
        parsedUrl = new URL(url);
    } catch (error) {
        // e.g. "http://" passes the scheme test above but is not a valid URL.
        log.warning(`Skipping malformed URL ${url}: ${error.message}`);
        return;
    }

    const guardian = await robotsChecker.getGuard(url);
    // robots.txt rules are matched against the path including the query string.
    const robotsPath = `${parsedUrl.pathname}${parsedUrl.search}`;
    if (guardian && !guardian.isAllowed(userAgent, robotsPath)) {
        log.warning(`Blocked by robots.txt: ${url}`);
        return;
    }

    // The check and the add are kept adjacent with no `await` in between, so two
    // overlapping calls for the same URL cannot both pass the check.
    if (processedUrls.has(url)) {
        log.info(`Already processed ${url}, skipping.`);
        return;
    }
    processedUrls.add(url);

    log.info(`Processing ${url} (depth: ${depth})`);

    let body;
    let baseUrl = url;
    try {
        const response = await got(url, {
            timeout: { request: 10000 },
            headers: { 'User-Agent': userAgent },
            agent: { http: httpAgent, https: httpsAgent },
        });
        // Media types are case-insensitive.
        const contentType = (response.headers['content-type'] || '').toLowerCase();
        if (!contentType.includes('text/html')) {
            log.info(`Skipping non-HTML content at ${url}`);
            return;
        }
        body = response.body;
        // After a redirect, relative references must resolve against the final URL.
        baseUrl = response.url || url;
    } catch (error) {
        log.error(`Failed to download ${url}: ${error.message}`);
        return;
    }

    const $ = cheerio.load(body);
    const now = new Date().toISOString();
    const toRecord = (imgUrl) => ({ url: imgUrl, sourcePage: url, foundAt: now });

    const images = extractImagesFromPage($, baseUrl);
    if (images.length > 0) {
        // One batched push instead of one awaited call per image.
        await Actor.pushData(images.map(toRecord));
    }
    log.info(`Found ${images.length} images on ${url}`);

    if (depth < maxCrawlDepth) {
        const { links, linkedImages } = extractLinksFromPage($, baseUrl);

        // Skip images already recorded above and repeated anchors to the same file.
        const alreadyRecorded = new Set(images);
        const newLinkedImages = [...new Set(linkedImages)]
            .filter((imgUrl) => !alreadyRecorded.has(imgUrl));
        if (newLinkedImages.length > 0) {
            await Actor.pushData(newLinkedImages.map(toRecord));
        }

        let enqueuedCount = 0;
        for (const link of links) {
            if (!/^https?:/.test(link)) continue;
            if (processedUrls.has(link) || !isUrlAllowedByScope(link)) continue;
            await requestQueue.addRequest({
                url: link,
                userData: { depth: depth + 1 },
            });
            enqueuedCount += 1;
        }

        // Report what was actually enqueued, not how many links the page contained.
        log.info(`Enqueued ${enqueuedCount} links from ${url}`);
    }
}
180
181 return { crawlPage };
182}