1import path from 'node:path';
2
3import { Actor, log } from 'apify';
4import * as archiver from 'archiver';
5import got from 'got';
6
// Boot the Apify SDK before reading input or creating the proxy configuration.
await Actor.init();

// Actor.getInput() may resolve to null when the run was started without any
// input. Fall back to an empty object so the explicit startUrl check below
// reports the problem instead of a TypeError from reading a property of null.
const input = (await Actor.getInput()) ?? {};
const proxyConfiguration = await Actor.createProxyConfiguration(input.proxyConfiguration);

// Crawl options taken from the input; everything except startUrl has a default.
const {
    startUrl,
    maxCrawlDepth = 1,
    maxConcurrency = 20,
    imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'svg'],
    respectRobotsTxt = true,
    userAgent = 'Mozilla/5.0 (compatible; ApifyBot/1.0; +https://apify.com/bot)',
    useScope = false,
    scope = [],
    includeSubdomains = false,
} = input;

// startUrl is the only mandatory field.
if (!startUrl) throw new Error('startUrl is required!');
25
26
27
// Input handed to the crawler actor: this actor's own options, with
// maxConcurrency capped at 100 and the raw proxy settings passed through.
// Key order is kept stable because the object is logged as JSON below.
const crawlerInput = {
    startUrl,
    maxCrawlDepth,
    maxConcurrency: Math.min(maxConcurrency, 100),
    imageExtensions,
    respectRobotsTxt,
    userAgent,
    useScope,
    scope,
    includeSubdomains,
    proxyConfiguration: input.proxyConfiguration,
};

log.info('Calling Website Image Crawler to discover image URLs...');
log.info(JSON.stringify(crawlerInput));

// Run the crawler on its latest build, waiting up to 1200 seconds for it.
const crawlerCallOptions = { build: 'latest', waitSecs: 1200 };
const crawlerRun = await Actor.call(
    'GomorrhaDev/website-image-scraper',
    crawlerInput,
    crawlerCallOptions,
);

const { status: crawlerStatus } = crawlerRun;
log.info(`Crawler finished. Status: ${crawlerStatus}`);

// Any status other than SUCCEEDED aborts this run.
const crawlerSucceeded = crawlerStatus === 'SUCCEEDED';
if (!crawlerSucceeded) {
    throw new Error(`Crawler run failed with status: ${crawlerStatus}`);
}
54
55
56
// Read the records the crawler run wrote to its default dataset.
const crawlerDataset = await Actor.apifyClient.dataset(crawlerRun.defaultDatasetId);
const crawlerDatasetPage = await crawlerDataset.listItems();
const { items } = crawlerDatasetPage;

log.info(`Crawler found ${items.length} image URLs. Downloading...`);

// With no image records there is nothing to download, so end the run here.
// NOTE(review): relies on Actor.exit() stopping the process so the download
// stage below is skipped - confirm against the SDK.
const hasNoImages = items.length === 0;
if (hasNoImages) {
    log.info('No images found. Nothing to download.');
    await Actor.exit();
}
66
67
68
// Builds one agent via the proxy configuration, or undefined when there is no
// configuration or it has no newProxyAgent method.
// NOTE(review): if the proxy configuration object does not expose
// newProxyAgent, both agents below are undefined and the image downloads are
// made without a proxy - confirm against the Apify SDK.
const createProxyAgent = () => proxyConfiguration?.newProxyAgent?.();

// One agent per protocol, in the { http, https } shape passed to got.
const httpAgent = createProxyAgent();
const httpsAgent = createProxyAgent();
const agent = { http: httpAgent, https: httpsAgent };

// Fallback MIME types keyed by file extension, used when a response carries
// no content-type header.
const contentTypeMap = {
    jpg: 'image/jpeg',
    jpeg: 'image/jpeg',
    png: 'image/png',
    gif: 'image/gif',
    webp: 'image/webp',
    bmp: 'image/bmp',
    svg: 'image/svg+xml',
};

// Mutable state shared by every download worker.
let downloadIndex = 0; // sequence number used to build storage keys
let downloaded = 0; // images stored successfully
let failed = 0; // images whose download or storage threw

// { key, buffer } for each stored image, kept in memory for the ZIP step.
const downloadedFiles = [];
87
/**
 * Derives the file extension to use in a storage key from an image URL.
 *
 * Only the URL's path is inspected, so dots inside the query string, the
 * fragment or the host name can no longer be mistaken for an extension.
 *
 * @param {string} imageUrl - URL of the image.
 * @returns {string} Lower-cased alphanumeric extension without the dot, or
 *   'jpg' when the path has no usable extension.
 */
function resolveImageExtension(imageUrl) {
    let pathname;
    try {
        ({ pathname } = new URL(imageUrl));
    } catch {
        // Not an absolute URL: strip the query and fragment by hand.
        pathname = String(imageUrl).split(/[?#]/)[0];
    }

    const ext = path.posix.extname(pathname).slice(1).toLowerCase();

    // Reject anything that is not a short alphanumeric token so the resulting
    // storage key never contains characters taken from a malformed path.
    return /^[a-z0-9]{1,10}$/.test(ext) ? ext : 'jpg';
}

/**
 * Downloads one image, stores it under a sequential key and records it.
 *
 * On success the image is written with Actor.setValue, a metadata row is
 * pushed with Actor.pushData, the shared `downloaded` counter is incremented
 * and `{ key, buffer }` is appended to `downloadedFiles`. Any error is logged
 * as a warning and counted in `failed`; it is never rethrown.
 *
 * @param {{ url: string, sourcePage?: string, foundAt?: string }} item -
 *   Record describing the image to fetch.
 * @returns {Promise<void>}
 */
async function downloadOne(item) {
    const imageUrl = item.url;
    try {
        const response = await got(imageUrl, {
            timeout: { request: 30000 },
            headers: { 'User-Agent': userAgent },
            agent,
            responseType: 'buffer',
            maxRedirects: 5,
        });

        const ext = resolveImageExtension(imageUrl);
        // Prefer the content type sent with the response, then the extension table.
        const contentType = response.headers['content-type'] || contentTypeMap[ext] || 'application/octet-stream';
        const buffer = Buffer.from(response.rawBody);

        // The index is taken synchronously, so concurrent workers get distinct keys.
        const idx = String(++downloadIndex).padStart(5, '0');
        const key = `image-${idx}.${ext}`;

        await Actor.setValue(key, buffer, { contentType });

        await Actor.pushData({
            imageUrl,
            sourcePage: item.sourcePage || '',
            foundAt: item.foundAt || '',
            key,
            contentType,
            sizeBytes: buffer.length,
        });

        downloaded++;
        downloadedFiles.push({ key, buffer });
        log.info(`[${downloaded}/${items.length}] ${key} (${(buffer.length / 1024).toFixed(1)} KB) <- ${imageUrl}`);
    } catch (err) {
        failed++;
        log.warning(`Failed to download: ${imageUrl} — ${err.message}`);
    }
}
125
126
// Size the worker pool: at most 50 workers, never more than there are items,
// and always at least one. Without the lower bound a maxConcurrency of 0, a
// negative number or a non-numeric value would start no workers at all and
// the run would finish without attempting a single download.
const requestedConcurrency = Math.trunc(Number(maxConcurrency)) || 1;
const concurrency = Math.max(1, Math.min(requestedConcurrency, 50, items.length));

// A single iterator shared by all workers: each worker pulls the next
// unclaimed item synchronously, so no item is handed out twice and no
// array copy has to be shifted.
const queue = items.values();
const workers = Array.from({ length: concurrency }, async () => {
    for (const item of queue) {
        await downloadOne(item);
    }
});

await Promise.all(workers);
log.info(`Downloads complete. ${downloaded} downloaded, ${failed} failed, ${items.length} total.`);
138
139
140
// Bundle every successfully downloaded image into one ZIP stored as images.zip.
if (downloadedFiles.length > 0) {
    log.info(`Creating ZIP archive from ${downloadedFiles.length} files...`);

    // The archive is collected in memory chunk by chunk as it is produced.
    const zipBuffers = [];
    await new Promise((resolve, reject) => {
        const archive = new archiver.ZipArchive({ zlib: { level: 9 } });
        archive.on('data', (chunk) => zipBuffers.push(chunk));
        archive.on('end', resolve);
        archive.on('error', reject);

        for (const { key, buffer } of downloadedFiles) {
            archive.append(buffer, { name: key });
        }

        // finalize() may return a promise; forward a rejection to this promise
        // instead of leaving it unhandled. Promise.resolve() keeps this safe
        // if a plain value is returned.
        Promise.resolve(archive.finalize()).catch(reject);
    });

    const zipBuffer = Buffer.concat(zipBuffers);
    await Actor.setValue('images.zip', zipBuffer, { contentType: 'application/zip' });

    log.info(`ZIP archive saved: images.zip (${(zipBuffer.length / 1024 / 1024).toFixed(1)} MB)`);
}

await Actor.exit();