fetch-ecommerce-image

red_vault/fetch-ecommerce-image

Note: this Actor is currently under maintenance and may be unreliable.
Get Amazon and Flipkart product images.
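
For example, a run that looks up a product image by ASIN could use an input like the following sketch, built from the fields defined in .actor/input_schema.json below (the values shown are the schema defaults):

{
    "type": "asin",
    "service": "amazon",
    "asincode": "B0B4N77Y34"
}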

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "fetch-amazon-image",
    "title": "Project Cheerio Crawler Javascript",
    "description": "Crawlee and Cheerio project in javascript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "CheerioCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "pageurl": {
            "title": "URL",
            "type": "string",
            "editor": "textfield",
            "description": "Product URL",
            "default": "https://crawlee.dev"
        },
        "asincode": {
            "title": "asincode",
            "type": "string",
            "editor": "textfield",
            "description": "ASIN code of the product",
            "default": "B0B4N77Y34"
        },
        "type": {
            "title": "Type: asin or search",
            "type": "string",
            "editor": "textfield",
            "description": "Must be either \"asin\" or \"search\"",
            "default": "asin"
        },
        "service": {
            "title": "Service: amazon or flipkart",
            "type": "string",
            "editor": "textfield",
            "description": "Must be either \"amazon\" or \"flipkart\"",
            "default": "amazon"
        },
        "maxRequestsPerCrawl": {
            "title": "Max Requests per Crawl",
            "type": "integer",
            "description": "Maximum number of requests that can be made by this crawler.",
            "default": 100
        }
    }
}

src/main.js

// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
import { Actor } from 'apify';
// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
import { CheerioCrawler, Dataset } from 'crawlee';
// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
// import { router } from './routes.js';

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// The structure of the input is defined in input_schema.json.
const {
    pageurl = 'https://crawlee.dev',
    asincode = 'B0B4N77Y34',
    type = 'asin',
    service = 'amazon',
    maxRequestsPerCrawl = 100,
} = await Actor.getInput() ?? {};

const proxyConfiguration = await Actor.createProxyConfiguration();

if (type === 'search' && service === 'amazon') {
    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, $, log }) {
            log.info('extracting product image');

            // The main product image on an Amazon product page.
            const image = $('#landingImage').attr('src');
            log.info('Log', { url: request.loadedUrl, image });

            await Dataset.pushData({ image });
        },
    });

    await crawler.run([pageurl]);
}

if (type === 'asin' && service === 'amazon') {
    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, $, log }) {
            log.info('extracting product image');

            // The product image in Amazon's "All offers display" (AOD) panel.
            const image = $('#aod-asin-image-id').attr('src');
            log.info('Log', { url: request.loadedUrl, image });

            await Dataset.pushData({ image });
        },
    });

    await crawler.run([`https://www.amazon.in/gp/aod/ajax?asin=${asincode}&ref=auto_load_aod&pc=dp`]);
}

if (type === 'search' && service === 'flipkart') {
    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, $, log }) {
            log.info('extracting product image');

            const ogImage = $('meta[property="og:image"]').attr('content');
            const addToCartButton = $('button:contains("Add to cart")');
            const buyNowButton = $('button:contains("Buy Now")');

            if (ogImage) {
                log.info('Log', { url: request.loadedUrl });

                await Dataset.pushData({ image: ogImage });
            } else if (addToCartButton.length === 0 && buyNowButton.length === 0) {
                // No product-page buttons, so this is a search results page:
                // take the first image from the search results.
                const images = $('div._1YokD2._2GoDe3 > div:nth-child(2) > div:nth-child(2) > div > div:nth-child(1) > div > a._2rpwqI > div:nth-child(1) > div > div > img');
                const image = images[0];

                const category = $('#container > div > div._36fx1h._6t1WkM._3HqJxg > div._1YokD2._2GoDe3 > div:nth-child(2) > div:nth-child(2) > div > div:nth-child(1) > div > a > div:nth-child(1) > div > div > div > img');
                const categoryImage = category[0];

                if (image) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: image.attribs.src });
                } else if (categoryImage) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: categoryImage.attribs.src });
                }
            } else if (addToCartButton.length > 0 && buyNowButton.length > 0) {
                // Both buttons present, so this is a product page:
                // take the landing page image.
                const image = $('img[loading="eager"]')[0];
                if (image) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: image.attribs.src });
                }
            } else {
                // If no Open Graph image is found, use a fallback (e.g., the favicon).
                const favicon = $('link[rel="icon"]').attr('href');
                if (favicon) {
                    log.info('Log', { url: request.loadedUrl });

                    await Dataset.pushData({ image: favicon });
                }
            }
        },
    });

    await crawler.run([pageurl]);
}

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
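
The Actor can also be invoked programmatically. Below is a minimal sketch using the official apify-client package, assuming an API token is available in the APIFY_TOKEN environment variable:

// Run the Actor and read its results via the Apify API.
import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start a run in "asin" mode and wait for it to finish.
const run = await client.actor('red_vault/fetch-ecommerce-image').call({
    type: 'asin',
    service: 'amazon',
    asincode: 'B0B4N77Y34',
});

// The scraped image URL(s) end up in the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items); // e.g. [{ image: 'https://...' }]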

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

oldmain.js

// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/)
import { Actor } from 'apify';
// Crawlee - web scraping and browser automation library (Read more at https://crawlee.dev)
import { CheerioCrawler, Dataset } from 'crawlee';
// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
// import { router } from './routes.js';

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// The structure of the input is defined in input_schema.json.
const {
    scrapurl = 'https://crawlee.dev',
    type = 'asin',
    maxRequestsPerCrawl = 100,
} = await Actor.getInput() ?? {};

const proxyConfiguration = await Actor.createProxyConfiguration();

if (type === 'asin') {
    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, $, log }) {
            log.info('extracting product image');

            // Extract title from the page.
            // const title = $('title').text();

            const image = $('#landingImage').attr('src');
            log.info('Log', { url: request.loadedUrl, image });

            if (!image) return request.pushErrorMessage('Failed to fetch image');

            // Save the image URL to the Dataset - a table-like storage.
            await Dataset.pushData({ image });
        },
    });

    await crawler.run([scrapurl]);
}

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();

package.json

{
    "name": "crawlee-cheerio-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is a boilerplate of an Apify actor.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.5.4"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
Maintained by Community
Actor metrics
  • 9 monthly users
  • 1 star
  • 100.0% runs succeeded
  • Created in Feb 2024
  • Modified 3 months ago