fetch-ecommerce-image avatar

fetch-ecommerce-image

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
fetch-ecommerce-image

fetch-ecommerce-image

red_vault/fetch-ecommerce-image

Get Amazon and Flipkart product images

.actor/Dockerfile

# Base image: Apify's Node.js 18 runtime. Read about the available images at
# https://docs.apify.com/sdk/js/docs/guides/docker-images — any other
# Docker Hub image would also work.
FROM apify/actor-node:18

# Copy only the package manifests first, so Docker's layer cache can skip
# the npm install step whenever just the source code changes.
COPY package*.json ./

# Install production dependencies only (no dev/optional packages) to keep the
# image small; print the resolved dependency tree and tool versions to aid
# debugging, then drop the npm cache from the image.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy the remaining files and directories with the source code.
# Because this happens after the dependency install, rebuilds triggered by
# source-only changes stay fast.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "fetch-amazon-image",
4    "title": "Fetch Ecommerce Image",
5    "description": "Fetches Amazon and Flipkart product images with Crawlee and Cheerio.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-cheerio"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "CheerioCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "pageurl": {
7            "title": "URL",
8            "type": "string",
9            "editor": "textfield",
10            "description": "Product URL",
11            "default": "https://crawlee.dev"
12        },
13        "asincode": {
14            "title": "asincode",
15            "type": "string",
16            "editor": "textfield",
17            "description": "Asin code for the product",
18            "default": "B0B4N77Y34"
19        },
20        "type": {
21            "title": "Type: asin or search",
22            "type": "string",
23            "editor": "textfield",
24            "description": "Must be one of: asin or search",
25            "default": "asin"
26        },
27        "service": {
28            "title": "Service: amazon or flipkart",
29            "type": "string",
30            "editor": "textfield",
31            "description": "Must be one of: amazon or flipkart",
32            "default": "amazon"
33        },
34        "maxRequestsPerCrawl": {
35            "title": "Max Requests per Crawl",
36            "type": "integer",
37            "description": "Maximum number of requests that can be made by this crawler.",
38            "default": 100
39        }
40    }
41}

src/main.js

1// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
2import { Actor } from 'apify';
3// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
4import { CheerioCrawler, Dataset } from 'crawlee';
5import { PlaywrightCrawler } from 'crawlee';
6import { ProxyConfiguration } from 'apify';
7// this is ESM project, and as such, it requires you to specify extensions in your relative imports
8// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
9// import { router } from './routes.js';
10
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init()
await Actor.init();

// Structure of input is defined in input_schema.json
const {
    pageurl = 'https://www.amazon.in/gp/aod/ajax?asin=B0D945V84N&ref=auto_load_aod&pc=dp',
    asincode = 'B0B4N77Y34',
    type = 'asin',
    service = 'amazon',
    maxRequestsPerCrawl = 2,
} = await Actor.getInput() ?? {};

// Use Actor.createProxyConfiguration() rather than `new ProxyConfiguration({...})`:
// the factory injects the Apify proxy password and base URL from the Actor's
// environment, so the RESIDENTIAL group actually authenticates. A directly
// constructed instance lacks those credentials and the proxied requests fail.
const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    countryCode: 'US', // Amazon serves location-dependent content, so pin the exit country.
});
29
// Amazon "search" flow: fetch the supplied product page over plain HTTP and
// read the main landing image out of the parsed HTML.
if (type === 'search' && service === 'amazon') {
    const searchCrawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        requestHandler: async ({ request, $, log }) => {
            log.info('enqueueing new URLs');

            // #landingImage is the primary product photo on Amazon detail pages.
            const landingImage = $('#landingImage').attr('src');
            log.info('Log', { url: request.loadedUrl, image: landingImage });

            // Persist the result to the default dataset.
            await Dataset.pushData({ image: landingImage });
        },
    });

    await searchCrawler.run([pageurl]);
}
47
// Amazon "asin" flow: hit the "All Offers Display" AJAX endpoint for the given
// ASIN and extract the offer image.
if (type === 'asin' && service === 'amazon') {
    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        // NOTE: the previous version passed a deprecated `handlePageFunction`
        // alongside `requestHandler`; it also referenced an undefined `log`,
        // which would have thrown a ReferenceError. The proxy logging now lives
        // inside the request handler, whose context provides `proxyInfo`.
        async requestHandler({ request, $, log, proxyInfo }) {
            log.info('enqueueing new URLs');

            // Log which proxy served this request (undefined when no proxy is used).
            if (proxyInfo) log.info(proxyInfo.url);

            const image = $('#aod-asin-image-id').attr('src');
            log.info('Log', { url: request.loadedUrl, image });

            await Dataset.pushData({ image })
        },
    });

    await crawler.run([`https://www.amazon.in/gp/aod/ajax?asin=${asincode}&ref=auto_load_aod&pc=dp`]);
}
69
// Amazon "asin" flow rendered in a real browser (service "amazon_v2").
// BUG FIX: PlaywrightCrawler's request-handler context has no `$` property —
// the previous version destructured it anyway, so `$('#aod-asin-image-id')`
// threw "$ is not a function" on every request. Use the context's
// parseWithCheerio() helper to get a Cheerio handle over the rendered page.
if (type === 'asin' && service === 'amazon_v2') {
    const crawler = new PlaywrightCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, parseWithCheerio, log }) {
            log.info('enqueueing new URLs');

            const $ = await parseWithCheerio();
            const image = $('#aod-asin-image-id').attr('src');
            log.info('Log', { url: request.loadedUrl, image });

            await Dataset.pushData({ image })
        },
    });

    await crawler.run([`https://www.amazon.in/gp/aod/ajax?asin=${asincode}&ref=auto_load_aod&pc=dp`]);
}
87
// Flipkart "search" flow. Tries, in order:
//   1. the Open Graph image (present on product detail pages),
//   2. the first thumbnail on a search-results page (no cart/buy buttons),
//   3. the eagerly-loaded hero image on a product page lacking og:image,
//   4. the favicon as a last resort (exactly one of the two buttons present).
// Fixes: strict equality (`===`) instead of loose `==`; attribute access goes
// through Cheerio's .attr() rather than reaching into the raw node's `attribs`,
// which would throw if the matched node had no attributes object.
if (type === 'search' && service === 'flipkart') {
    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, $, log }) {
            log.info('enqueueing new URLs');

            const ogImage = $('meta[property="og:image"]').attr('content');
            // The cart/buy buttons distinguish a product detail page from a
            // search-results or category page.
            const addToCartButton = $('button:contains("Add to cart")');
            const buyNowButton = $('button:contains("Buy Now")');

            if (ogImage) {
                log.info('Log', { url: request.loadedUrl });

                await Dataset.pushData({ image: ogImage });
            } else if (addToCartButton.length === 0 && buyNowButton.length === 0) {
                // Search-results page: take the first product thumbnail.
                const images = $('div._1YokD2._2GoDe3 > div:nth-child(2) > div:nth-child(2) > div > div:nth-child(1) > div > a._2rpwqI > div:nth-child(1) > div > div > img');
                const image = images[0];

                // Category pages use a slightly different card markup.
                const category = $("#container > div > div._36fx1h._6t1WkM._3HqJxg > div._1YokD2._2GoDe3 > div:nth-child(2) > div:nth-child(2) > div > div:nth-child(1) > div > a > div:nth-child(1) > div > div > div > img")
                const categoryImage = category[0];

                if (image) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: $(image).attr('src') });
                } else if (categoryImage) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: $(categoryImage).attr('src') });
                }
            } else if (addToCartButton.length > 0 && buyNowButton.length > 0) {
                // Product page without og:image: use the eagerly-loaded hero image.
                const image = $('img[loading="eager"]')[0];
                if (image) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: $(image).attr('src') });
                }
            } else {
                // If no Open Graph image is found, use a fallback (e.g., favicon)
                const favicon = $('link[rel="icon"]').attr('href');
                if (favicon) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: favicon });
                }
            }
        },
    });

    await crawler.run([pageurl]);
}
146
// Gracefully shut the Actor down so the platform records a clean finish.
await Actor.exit();

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

oldmain.js

1// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
2import { Actor } from 'apify';
3// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
4import { CheerioCrawler, Dataset } from 'crawlee';
5// this is ESM project, and as such, it requires you to specify extensions in your relative imports
6// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
7// import { router } from './routes.js';
8
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init()
await Actor.init();

// Structure of input is defined in input_schema.json
const input = await Actor.getInput() ?? {};
const {
    scrapurl = 'https://crawlee.dev',
    type = 'asin',
    maxRequestsPerCrawl = 100,
} = input;

const proxyConfiguration = await Actor.createProxyConfiguration();

// Only the "asin" flow is implemented here: fetch the page and extract
// Amazon's main landing image.
if (type === 'asin') {
    const asinCrawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        requestHandler: async ({ request, $, log }) => {
            log.info('enqueueing new URLs');

            const image = $('#landingImage').attr('src');
            log.info('Log', { url: request.loadedUrl, image: image });

            // Record a per-request error when the selector matched nothing.
            if (!image) return request.pushErrorMessage("Failed to fetch image");

            // Save the image URL to the default dataset.
            await Dataset.pushData({ image });
        },
    });

    await asinCrawler.run([scrapurl]);
}

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit()
await Actor.exit();

package.json

1{
2    "name": "crawlee-cheerio-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is a boilerplate of an Apify actor.",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.1.10",
11        "crawlee": "^3.5.4"
12    },
13    "devDependencies": {
14        "@apify/eslint-config": "^0.4.0",
15        "eslint": "^8.50.0"
16    },
17    "scripts": {
18        "start": "node src/main.js",
19        "lint": "eslint ./src --ext .js,.jsx",
20        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
21        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
22    },
23    "author": "It's not you it's me",
24    "license": "ISC"
25}
Developer
Maintained by Community
Categories