Website Image Scraper
Website Image Scraper is a fast, lightweight tool that crawls websites to extract image URLs (jpg, png, svg) without downloading files or using browsers. It supports recursive crawling, respects robots.txt, and efficiently collects image links for analysis, monitoring, or later download.
Rating: 0.0 (0 reviews)
Pricing: $2.00 / 1,000 images
Total users: 2
Monthly users: 2
Runs succeeded: >99%
Last modified: 3 days ago
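Before the source files below, here is a minimal sketch of a run input. The field names come from the schema in .actor/input_schema.json further down; the values are illustrative only, not taken from a real run.

{
    "startUrl": "https://example.com",
    "maxCrawlDepth": 1,
    "maxConcurrency": 10,
    "imageExtensions": ["jpg", "jpeg", "png", "svg"]
}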
.dockerignore
# configurations
.idea
.vscode
.zed

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
quote_type = single
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
.vscode
.zed
node_modules
storage

# Added by Apify CLI
.venv
.prettierrc
{ "printWidth": 120, "tabWidth": 4, "singleQuote": true}
Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:22

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Create and run as a non-root user.
RUN adduser -h /home/apify -D apify && \
    chown -R apify:apify ./
USER apify

# Run the image.
CMD npm start --silent
eslint.config.mjs
import prettier from 'eslint-config-prettier';

import apify from '@apify/eslint-config/js.js';

// eslint-disable-next-line import/no-default-export
export default [{ ignores: ['**/dist'] }, ...apify, prettier];
package.json
{ "name": "website-image-crawler", "version": "0.0.1", "type": "module", "description": "This is a boilerplate of an Apify Actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.4.2", "cheerio": "^1.1.0", "got": "^14.4.7" }, "devDependencies": { "@apify/eslint-config": "^1.0.0", "eslint": "^9.29.0", "eslint-config-prettier": "^10.1.5", "prettier": "^3.5.3" }, "scripts": { "start": "node ./src/main.js", "format": "prettier --write .", "format:check": "prettier --check .", "lint": "eslint", "lint:fix": "eslint --fix", "test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}
.actor/actor.json
{ "actorSpecification": 1, "name": "website-image-scraper", "title": "Website Image Scraper", "description": "Crawls websites to index URLs and download all images without duplicates", "version": "1.0", "buildTag": "latest", "input": "./input_schema.json", "meta": { "templateId": "js-empty" }, "dockerfile": "../Dockerfile", "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 1024 }}
.actor/input_schema.json
{ "title": "Image Scraper Input", "description": "Configure the image scraper to crawl websites and download all images without duplicates", "type": "object", "schemaVersion": 1, "properties": { "startUrl": { "title": "Start URL", "type": "string", "description": "The URL of the website to start scraping images from", "prefill": "https://example.com", "example": "https://example.com", "editor": "textfield" }, "maxCrawlDepth": { "title": "Maximum Crawl Depth", "type": "integer", "description": "How deep should the crawler navigate through links (0 = only start URL, 1 = start URL + direct links, etc.)", "default": 1, "prefill": 1, "example": 2, "minimum": 0, "maximum": 5, "editor": "number" }, "maxConcurrency": { "title": "Maximum Concurrency", "type": "integer", "description": "Number of pages processed in parallel. Higher values = faster crawling but more resource usage", "default": 10, "prefill": 10, "example": 5, "minimum": 1, "maximum": 50, "editor": "number" }, "outputDir": { "title": "Output Directory", "type": "string", "description": "Directory path where downloaded images will be saved", "default": "./images", "prefill": "./images", "example": "./downloaded_images", "editor": "textfield" }, "imageExtensions": { "title": "Image Extensions", "type": "array", "description": "List of image file extensions to search for and download", "default": ["jpg", "jpeg", "png", "gif", "webp", "bmp", "svg"], "prefill": ["jpg", "jpeg", "png", "gif", "webp", "bmp", "svg"], "example": ["jpg", "png", "gif"], "editor": "stringList" }, "respectRobotsTxt": { "title": "Respect robots.txt", "type": "boolean", "description": "Whether to respect robots.txt files when crawling", "default": true, "prefill": true, "sectionCaption": "Advanced Settings", "sectionDescription": "These settings are for advanced users who want to fine-tune the scraper behavior" }, "userAgent": { "title": "User Agent", "type": "string", "description": "Custom User-Agent string to use for requests", "default": "Mozilla/5.0 (compatible; ApifyBot/1.0; +https://apify.com/bot)", "prefill": "Mozilla/5.0 (compatible; ApifyBot/1.0; +https://apify.com/bot)", "editor": "textfield" }, "downloadTimeout": { "title": "Download Timeout (seconds)", "type": "integer", "description": "Maximum time to wait for each image download", "default": 30, "prefill": 30, "minimum": 5, "maximum": 300, "editor": "number" } }, "required": ["startUrl"]}
src/main.js
import path from 'node:path';

import { Actor, log } from 'apify';
import * as cheerio from 'cheerio';
import got from 'got';

await Actor.init();

const input = await Actor.getInput();
const {
    startUrl,
    maxCrawlDepth = 1,
    maxConcurrency = 10,
    imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp', 'svg']
} = input;

if (!startUrl) throw new Error('startUrl is required!');

log.info('Starting Image URL Scraper', { startUrl, maxCrawlDepth, maxConcurrency });

const requestQueue = await Actor.openRequestQueue();
await requestQueue.addRequest({ url: startUrl, userData: { depth: 0 } });

const processedUrls = new Set();
const foundImages = [];

const crawlPage = async (request) => {
    const { url, userData: { depth } } = request;

    log.info(`Processing ${url} (depth: ${depth})`);

    if (processedUrls.has(url)) {
        log.info(`Already processed ${url}, skipping.`);
        return;
    }
    processedUrls.add(url);

    let body;
    try {
        const response = await got(url, { timeout: { request: 10000 } });
        body = response.body;
    } catch (error) {
        log.error(`Failed to download ${url}: ${error.message}`);
        return;
    }

    const $ = cheerio.load(body);

    // Find images
    const imagesOnPage = [];

    // 1) img[src]
    $('img[src]').each((_, el) => {
        const src = $(el).attr('src');
        if (src) imagesOnPage.push(src);
    });

    // 2) background images via style attributes or computed CSS (basic)
    $('[style]').each((_, el) => {
        const style = $(el).attr('style') || '';
        const match = style.match(/background-image:\s*url\(["']?([^"')]+)["']?\)/i);
        if (match) imagesOnPage.push(match[1]);
    });

    // Filter valid absolute URLs and extensions
    const filteredImages = imagesOnPage
        .map(src => {
            // make absolute URL
            try {
                return new URL(src, url).href;
            } catch {
                return null;
            }
        })
        .filter(src => src && imageExtensions.includes(path.extname(src).substring(1).toLowerCase()));

    // Deduplicate
    const uniqueImages = [...new Set(filteredImages)];

    log.info(`Found ${uniqueImages.length} images on ${url}`);

    // Store images with metadata
    for (const imgUrl of uniqueImages) {
        foundImages.push({
            url: imgUrl,
            sourcePage: url,
            detectedAt: new Date().toISOString(),
        });
    }

    // If depth limit not reached, enqueue links on the page
    if (depth < maxCrawlDepth) {
        const links = [];
        $('a[href]').each((_, el) => {
            const href = $(el).attr('href');
            if (!href) return;
            try {
                const absoluteUrl = new URL(href, url).href;
                links.push(absoluteUrl);
            } catch {
                // ignore invalid URLs
            }
        });

        const uniqueLinks = [...new Set(links)];

        for (const link of uniqueLinks) {
            if (!processedUrls.has(link)) {
                await requestQueue.addRequest({
                    url: link,
                    userData: { depth: depth + 1 }
                });
            }
        }

        log.info(`Enqueued ${uniqueLinks.length} links from ${url}`);
    }
};

// Worker loop with concurrency
const concurrency = Math.min(maxConcurrency, 20);
const promises = [];

for (let i = 0; i < concurrency; i++) {
    promises.push((async () => {
        while (true) {
            const request = await requestQueue.fetchNextRequest();
            if (!request) break;

            try {
                await crawlPage(request);
                await requestQueue.markRequestHandled(request);
            } catch (err) {
                log.error(`Error crawling ${request.url}: ${err.message}`);
                await requestQueue.markRequestFailed(request);
            }
        }
    })());
}

await Promise.all(promises);

// Push final result to dataset
await Actor.pushData({
    totalUrlsProcessed: processedUrls.size,
    totalImagesFound: foundImages.length,
    images: foundImages,
});

log.info('Crawl finished.');
await Actor.exit();
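For reference, the summary record that src/main.js pushes to the default dataset at the end of a run would look roughly like the sketch below. Only the field names (totalUrlsProcessed, totalImagesFound, images, url, sourcePage, detectedAt) come from the code above; the counts and URLs are made-up illustrative values.

{
    "totalUrlsProcessed": 3,
    "totalImagesFound": 1,
    "images": [
        {
            "url": "https://example.com/assets/logo.svg",
            "sourcePage": "https://example.com/",
            "detectedAt": "2025-01-01T12:00:00.000Z"
        }
    ]
}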