Website Image Scraper

Developed by F. Gutz
Maintained by Community

Website Image Scraper is a fast, lightweight tool that crawls websites to extract image URLs (jpg, png, svg) without downloading files or using browsers. It supports recursive crawling, respects robots.txt, and efficiently collects image links for analysis, monitoring, or later download.

Rating: 0.0 (0)
Pricing: $2.00 / 1,000 images
Total users: 2
Monthly users: 2
Runs succeeded: >99%
Last modified: 3 days ago
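
For instance, the Actor can also be started programmatically with the Apify API client for JavaScript. The sketch below is illustrative only: the Actor ID placeholder and the APIFY_TOKEN environment variable are assumptions you need to replace with your own values.

import { ApifyClient } from 'apify-client';

// Illustrative sketch: run the Actor via the Apify API client.
// '<username>/website-image-scraper' is a placeholder Actor ID.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

const run = await client.actor('<username>/website-image-scraper').call({
    startUrl: 'https://example.com',
    maxCrawlDepth: 1,
    maxConcurrency: 10,
});

// The scraped image URLs are stored in the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Fetched ${items.length} dataset item(s)`);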

.dockerignore

# configurations
.idea
.vscode
.zed
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
quote_type = single

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
.zed
node_modules
storage
# Added by Apify CLI
.venv

.prettierrc

{
    "printWidth": 120,
    "tabWidth": 4,
    "singleQuote": true
}

Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:22
# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Create and run as a non-root user.
RUN adduser -h /home/apify -D apify && \
    chown -R apify:apify ./
USER apify
# Run the image.
CMD npm start --silent

eslint.config.mjs

import prettier from 'eslint-config-prettier';

import apify from '@apify/eslint-config/js.js';

// eslint-disable-next-line import/no-default-export
export default [{ ignores: ['**/dist'] }, ...apify, prettier];

package.json

{
    "name": "website-image-crawler",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify Actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.4.2",
        "cheerio": "^1.1.0",
        "got": "^14.4.7"
    },
    "devDependencies": {
        "@apify/eslint-config": "^1.0.0",
        "eslint": "^9.29.0",
        "eslint-config-prettier": "^10.1.5",
        "prettier": "^3.5.3"
    },
    "scripts": {
        "start": "node ./src/main.js",
        "format": "prettier --write .",
        "format:check": "prettier --check .",
        "lint": "eslint",
        "lint:fix": "eslint --fix",
        "test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "website-image-scraper",
    "title": "Website Image Scraper",
    "description": "Crawls websites to index pages and collect all image URLs without duplicates",
    "version": "1.0",
    "buildTag": "latest",
    "input": "./input_schema.json",
    "meta": {
        "templateId": "js-empty"
    },
    "dockerfile": "../Dockerfile",
    "defaultRunOptions": {
        "build": "latest",
        "timeoutSecs": 3600,
        "memoryMbytes": 1024
    }
}

.actor/input_schema.json

{
    "title": "Image Scraper Input",
    "description": "Configure the scraper to crawl websites and collect all image URLs without duplicates",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrl": {
            "title": "Start URL",
            "type": "string",
            "description": "The URL of the website to start scraping images from",
            "prefill": "https://example.com",
            "example": "https://example.com",
            "editor": "textfield"
        },
        "maxCrawlDepth": {
            "title": "Maximum Crawl Depth",
            "type": "integer",
            "description": "How deep the crawler should navigate through links (0 = only the start URL, 1 = start URL + direct links, etc.)",
            "default": 1,
            "prefill": 1,
            "example": 2,
            "minimum": 0,
            "maximum": 5,
            "editor": "number"
        },
        "maxConcurrency": {
            "title": "Maximum Concurrency",
            "type": "integer",
            "description": "Number of pages processed in parallel. Higher values = faster crawling but more resource usage",
            "default": 10,
            "prefill": 10,
            "example": 5,
            "minimum": 1,
            "maximum": 50,
            "editor": "number"
        },
        "outputDir": {
            "title": "Output Directory",
            "type": "string",
            "description": "Directory path where downloaded images will be saved",
            "default": "./images",
            "prefill": "./images",
            "example": "./downloaded_images",
            "editor": "textfield"
        },
        "imageExtensions": {
            "title": "Image Extensions",
            "type": "array",
            "description": "List of image file extensions to search for and download",
            "default": ["jpg", "jpeg", "png", "gif", "webp", "bmp", "svg"],
            "prefill": ["jpg", "jpeg", "png", "gif", "webp", "bmp", "svg"],
            "example": ["jpg", "png", "gif"],
            "editor": "stringList"
        },
        "respectRobotsTxt": {
            "title": "Respect robots.txt",
            "type": "boolean",
            "description": "Whether to respect robots.txt files when crawling",
            "default": true,
            "prefill": true,
            "sectionCaption": "Advanced Settings",
            "sectionDescription": "These settings are for advanced users who want to fine-tune the scraper behavior"
        },
        "userAgent": {
            "title": "User Agent",
            "type": "string",
            "description": "Custom User-Agent string to use for requests",
            "default": "Mozilla/5.0 (compatible; ApifyBot/1.0; +https://apify.com/bot)",
            "prefill": "Mozilla/5.0 (compatible; ApifyBot/1.0; +https://apify.com/bot)",
            "editor": "textfield"
        },
        "downloadTimeout": {
            "title": "Download Timeout (seconds)",
            "type": "integer",
            "description": "Maximum time to wait for each image download",
            "default": 30,
            "prefill": 30,
            "minimum": 5,
            "maximum": 300,
            "editor": "number"
        }
    },
    "required": ["startUrl"]
}
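
For reference, a full input object that conforms to this schema could look like the following (all values are illustrative):

{
    "startUrl": "https://example.com",
    "maxCrawlDepth": 2,
    "maxConcurrency": 5,
    "imageExtensions": ["jpg", "png", "webp"],
    "respectRobotsTxt": true,
    "userAgent": "Mozilla/5.0 (compatible; ApifyBot/1.0; +https://apify.com/bot)",
    "downloadTimeout": 30
}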

src/main.js

import path from 'node:path';

import { Actor, log } from 'apify';
import * as cheerio from 'cheerio';
import got from 'got';

await Actor.init();

const input = (await Actor.getInput()) ?? {};
const {
    startUrl,
    maxCrawlDepth = 1,
    maxConcurrency = 10,
    imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'svg'],
} = input;

if (!startUrl) throw new Error('startUrl is required!');

log.info('Starting Image URL Scraper', { startUrl, maxCrawlDepth, maxConcurrency });

const requestQueue = await Actor.openRequestQueue();
await requestQueue.addRequest({ url: startUrl, userData: { depth: 0 } });

const processedUrls = new Set();
const foundImages = [];

const crawlPage = async (request) => {
    const { url, userData: { depth } } = request;

    log.info(`Processing ${url} (depth: ${depth})`);

    if (processedUrls.has(url)) {
        log.info(`Already processed ${url}, skipping.`);
        return;
    }
    processedUrls.add(url);

    let body;
    try {
        const response = await got(url, { timeout: { request: 10000 } });
        body = response.body;
    } catch (error) {
        log.error(`Failed to download ${url}: ${error.message}`);
        return;
    }

    const $ = cheerio.load(body);

    // Collect candidate image URLs from the page
    const imagesOnPage = [];

    // 1) img[src]
    $('img[src]').each((_, el) => {
        const src = $(el).attr('src');
        if (src) imagesOnPage.push(src);
    });

    // 2) background images declared in inline style attributes
    $('[style]').each((_, el) => {
        const style = $(el).attr('style') || '';
        const match = style.match(/background-image:\s*url\(["']?([^"')]+)["']?\)/i);
        if (match) imagesOnPage.push(match[1]);
    });

    // Resolve to absolute URLs and keep only the configured extensions.
    // The extension is taken from the URL path so query strings don't interfere.
    const filteredImages = imagesOnPage
        .map((src) => {
            try {
                return new URL(src, url);
            } catch {
                return null;
            }
        })
        .filter((img) => img && imageExtensions.includes(path.extname(img.pathname).slice(1).toLowerCase()))
        .map((img) => img.href);

    // Deduplicate
    const uniqueImages = [...new Set(filteredImages)];

    log.info(`Found ${uniqueImages.length} images on ${url}`);

    // Store images with metadata
    for (const imgUrl of uniqueImages) {
        foundImages.push({
            url: imgUrl,
            sourcePage: url,
            detectedAt: new Date().toISOString(),
        });
    }

    // If the depth limit has not been reached, enqueue links found on the page
    if (depth < maxCrawlDepth) {
        const links = [];
        $('a[href]').each((_, el) => {
            const href = $(el).attr('href');
            if (!href) return;
            try {
                const absoluteUrl = new URL(href, url).href;
                links.push(absoluteUrl);
            } catch {
                // ignore invalid URLs
            }
        });

        const uniqueLinks = [...new Set(links)];

        for (const link of uniqueLinks) {
            if (!processedUrls.has(link)) {
                await requestQueue.addRequest({
                    url: link,
                    userData: { depth: depth + 1 },
                });
            }
        }

        log.info(`Enqueued ${uniqueLinks.length} links from ${url}`);
    }
};

// Worker loop with concurrency
const concurrency = Math.min(maxConcurrency, 20);
const promises = [];

for (let i = 0; i < concurrency; i++) {
    promises.push((async () => {
        while (true) {
            const request = await requestQueue.fetchNextRequest();
            if (!request) {
                // The queue can be momentarily empty while other workers are still
                // processing pages that may enqueue new links, so only stop once
                // the queue is truly finished.
                if (await requestQueue.isFinished()) break;
                await new Promise((resolve) => setTimeout(resolve, 500));
                continue;
            }

            try {
                await crawlPage(request);
                await requestQueue.markRequestHandled(request);
            } catch (err) {
                log.error(`Error crawling ${request.url}: ${err.message}`);
                // Reclaim the request for a couple of retries, then mark it handled
                // so a persistently failing URL cannot block the run from finishing.
                if (request.retryCount < 2) {
                    request.retryCount += 1;
                    await requestQueue.reclaimRequest(request);
                } else {
                    await requestQueue.markRequestHandled(request);
                }
            }
        }
    })());
}

await Promise.all(promises);

// Push final result to dataset
await Actor.pushData({
    totalUrlsProcessed: processedUrls.size,
    totalImagesFound: foundImages.length,
    images: foundImages,
});

log.info('Crawl finished.');
await Actor.exit();
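
Based on the Actor.pushData call above, a run produces a single dataset item of roughly the following shape (values are illustrative):

{
    "totalUrlsProcessed": 12,
    "totalImagesFound": 48,
    "images": [
        {
            "url": "https://example.com/assets/logo.png",
            "sourcePage": "https://example.com/",
            "detectedAt": "2025-01-01T12:00:00.000Z"
        }
    ]
}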