import path from 'node:path';

import { Actor, log } from 'apify';
import * as cheerio from 'cheerio';
import got from 'got';

await Actor.init();
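
// Read the actor input and apply defaults; Actor.getInput() resolves to null when no input was provided.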
const input = await Actor.getInput() ?? {};
const {
    startUrl,
    maxCrawlDepth = 1,
    maxConcurrency = 10,
    imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'svg'],
} = input;

if (!startUrl) throw new Error('startUrl is required!');

log.info('Starting Image URL Scraper', { startUrl, maxCrawlDepth, maxConcurrency });

const requestQueue = await Actor.openRequestQueue();
await requestQueue.addRequest({ url: startUrl, userData: { depth: 0 } });
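
// In-memory crawl state shared by all workers: pages already crawled and image records collected so far.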
const processedUrls = new Set();
const foundImages = [];
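
// Download one page, extract image URLs from it, and enqueue its links for the next depth level.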
const crawlPage = async (request) => {
    const { url, userData: { depth } } = request;

    log.info(`Processing ${url} (depth: ${depth})`);

    if (processedUrls.has(url)) {
        log.info(`Already processed ${url}, skipping.`);
        return;
    }
    processedUrls.add(url);

    let body;
    try {
        const response = await got(url, { timeout: { request: 10000 } });
        body = response.body;
    } catch (error) {
        log.error(`Failed to download ${url}: ${error.message}`);
        return;
    }

    const $ = cheerio.load(body);
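
    // Collect candidate image URLs from the page; they are normalised and filtered below.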
    const imagesOnPage = [];

    // <img> elements with a src attribute.
    $('img[src]').each((_, el) => {
        const src = $(el).attr('src');
        if (src) imagesOnPage.push(src);
    });

    // Inline-style background images (background-image: url(...)).
    $('[style]').each((_, el) => {
        const style = $(el).attr('style') || '';
        const match = style.match(/background-image:\s*url\(["']?([^"')]+)["']?\)/i);
        if (match) imagesOnPage.push(match[1]);
    });

    // Resolve each candidate against the page URL and keep only the configured image extensions.
    // The extension is read from the URL path so query strings and fragments don't break the check.
    const filteredImages = imagesOnPage
        .map((src) => {
            try {
                return new URL(src, url).href;
            } catch {
                return null;
            }
        })
        .filter((src) => {
            if (!src) return false;
            const extension = path.extname(new URL(src).pathname).slice(1).toLowerCase();
            return imageExtensions.includes(extension);
        });

    const uniqueImages = [...new Set(filteredImages)];

    log.info(`Found ${uniqueImages.length} images on ${url}`);

    // Record each unique image together with the page it was found on.
    for (const imgUrl of uniqueImages) {
        foundImages.push({
            url: imgUrl,
            sourcePage: url,
            detectedAt: new Date().toISOString(),
        });
    }

    // Enqueue links for the next depth level until the configured maximum depth is reached.
    if (depth < maxCrawlDepth) {
        const links = [];
        $('a[href]').each((_, el) => {
            const href = $(el).attr('href');
            if (!href) return;
            try {
                const absoluteUrl = new URL(href, url);
                // Skip mailto:, javascript:, tel: and other non-HTTP(S) schemes.
                if (absoluteUrl.protocol === 'http:' || absoluteUrl.protocol === 'https:') {
                    links.push(absoluteUrl.href);
                }
            } catch {
                // Ignore hrefs that are not valid URLs.
            }
        });

        const uniqueLinks = [...new Set(links)];

        let enqueuedCount = 0;
        for (const link of uniqueLinks) {
            if (!processedUrls.has(link)) {
                await requestQueue.addRequest({
                    url: link,
                    userData: { depth: depth + 1 },
                });
                enqueuedCount += 1;
            }
        }

        log.info(`Enqueued ${enqueuedCount} of ${uniqueLinks.length} links from ${url}`);
    }
};
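
// Run a simple worker pool: each worker keeps pulling requests from the queue until the queue is finished.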
const concurrency = Math.min(maxConcurrency, 20);
const promises = [];

for (let i = 0; i < concurrency; i++) {
    promises.push((async () => {
        while (true) {
            const request = await requestQueue.fetchNextRequest();
            if (!request) {
                // The queue can be momentarily empty while other workers are still enqueueing
                // new links, so only stop once the queue reports that it is truly finished.
                if (await requestQueue.isFinished()) break;
                await new Promise((resolve) => setTimeout(resolve, 500));
                continue;
            }

            try {
                await crawlPage(request);
                await requestQueue.markRequestHandled(request);
            } catch (err) {
                log.error(`Error crawling ${request.url}: ${err.message}`);
                // The RequestQueue API has no markRequestFailed(); mark the request handled so it is not retried.
                await requestQueue.markRequestHandled(request);
            }
        }
    })());
}

await Promise.all(promises);
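
// Store the crawl summary together with every collected image record as a single dataset item.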
await Actor.pushData({
    totalUrlsProcessed: processedUrls.size,
    totalImagesFound: foundImages.length,
    images: foundImages,
});

log.info('Crawl finished.');
await Actor.exit();