skyBox-AmazonScrapper avatar
skyBox-AmazonScrapper
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
skyBox-AmazonScrapper

skyBox-AmazonScrapper

asadali/skybox

It is an Amazon scraper that scrapes a product's ranking within a specific category and the seller's information, such as seller name, address, and contact information.

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-puppeteer-chrome:16
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor-1",
4    "title": "Project Puppeteer Crawler JavaScript",
5    "description": "Crawlee and Puppeteer project in JavaScript.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-puppeteer-chrome"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile"
12}

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "Start URLs",
8            "type": "array",
9            "description": "URLs to start with.",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://apify.com"
14                }
15            ]
16        },
17        "ASIN": {
18            "title": "ASIN Number",
19            "type": "string",
20            "description": "The ASIN of the Amazon product.",
21            "editor": "textfield"
22        },
23        "MaxPageRetries":{
24            "title":"Max Page Retry",
25            "type": "integer",
26            "description": "The maximum number of retries after failure.",
27            "minimum": 0,
28            "default": 5
29        },
30        "MaxConcurrency":{
31            "title": "Max Concurrency",
32            "type": "integer",
33            "description": "The maximum number of concurrent tasks running at a time.",
34            "minimum": 0,
35            "default": 5
36        },
37        "Proxy":{
38            "sectionCaption": "Proxy Configuration",
39            "title": "Proxy configuration",
40            "type": "object",
41            "description": "Select proxies to be used by your crawler.",
42            "prefill": { "useApifyProxy": true },
43            "editor": "proxy"
44        }
45    }
46
47}

src/main.js

1import { Actor } from 'apify';
2import { PuppeteerCrawler } from 'crawlee';
3import cheerio from 'cheerio';
4import { router } from './routes.js';
5
// Entry point: initialise the Apify SDK, read the Actor input, crawl the first
// start URL with a Puppeteer crawler and shut the Actor down.
await Actor.init();

// Actor.getInput() resolves to null when the run has no input; fall back to an
// empty object so the destructuring below cannot throw.
const input = (await Actor.getInput()) ?? {};
const { MaxPageRetries, MaxConcurrency, startUrls } = input;

// Fail with an actionable message instead of a TypeError on `startUrls[0]`.
const firstStartUrl = startUrls?.[0]?.url;
if (!firstStartUrl) {
    throw new Error('Input field "startUrls" must contain at least one entry with a "url".');
}

// NOTE(review): the input schema exposes a "Proxy" field, but the proxy
// settings are hard-coded here; confirm whether that is intentional.
const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    countryCode: 'GB',
});

// The schema allows MaxConcurrency = 0; only forward values of at least 1 and
// otherwise leave the crawler on its own default.
const concurrencyOptions = Number.isInteger(MaxConcurrency) && MaxConcurrency >= 1
    ? { maxConcurrency: MaxConcurrency }
    : {};

const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    maxRequestRetries: MaxPageRetries,
    ...concurrencyOptions,
    failedRequestHandler: async ({ request }) => {
        console.log(`Request ${request.url} failed too many times.`);
    },
    requestHandler: router,
});

// Only the first start URL is crawled; any further entries are ignored.
await crawler.run([firstStartUrl]);

await Actor.exit();

src/routes.js

1import { Actor } from 'apify';
2// import { PuppeteerCrawler } from 'crawlee';
3import cheerio from 'cheerio';
4import getSellerInformation from './sellerInfo.js';
5import { createPuppeteerRouter } from 'crawlee';
6
// NOTE: main.js imports this module, and ES module imports are evaluated
// before the importing module's body, so this Actor.init() call runs before
// the one in main.js.
await Actor.init();

// Actor.getInput() resolves to null when the run has no input; default to an
// empty object so the destructuring cannot throw.
const { ASIN } = (await Actor.getInput()) ?? {};

export const router = createPuppeteerRouter();

/**
 * Default route.
 *
 * 1. Collects the `data-asin` attribute of every search-result element on the
 *    loaded page and logs the list.
 * 2. If an ASIN was supplied in the input, logs its 1-based position in that
 *    list, opens the product page and then the seller profile, and stores
 *    `{ ASIN, placement, sellerRating, sellerName, sellerAddress }` in the
 *    dataset named "SKYBOX".
 *
 * `placement` is 0 when the ASIN does not appear among the search results.
 */
router.addDefaultHandler(async ({ page }) => {
    const searchResults = cheerio.load(await page.content());

    const asinNumbers = [];
    searchResults('div[data-asin][data-component-type="s-search-result"]').each((index, element) => {
        asinNumbers.push(searchResults(element).attr('data-asin'));
    });

    console.log(asinNumbers);

    if (!ASIN) {
        console.log('No ASIN provided in the input; skipping placement and seller lookup.');
        return;
    }

    // indexOf() returns -1 for a missing entry, so 0 means "not on this page".
    const placement = asinNumbers.indexOf(ASIN) + 1;
    if (placement > 0) {
        console.log(`Product ASIN ${ASIN} is placed at position: ${placement} on the page`);
    } else {
        console.log(`Product ASIN ${ASIN} was not found on the page`);
    }

    // Navigate to the product page, then follow the seller link to the
    // seller profile and wait until its rating element is present.
    await page.goto(`https://www.amazon.com/dp/${ASIN}`);
    await page.waitForSelector('#sellerProfileTriggerId');

    await page.click('#sellerProfileTriggerId');
    await page.waitForSelector('#effective-timeperiod-rating-year-description');

    const sellerPage = cheerio.load(await page.content());

    const sellerRating = sellerPage('#effective-timeperiod-rating-year-description').text().trim();
    const sellerName = sellerPage('#page-section-detail-seller-info > div > div > div > div:nth-child(2) > span:nth-child(2)').text().trim();
    const sellerAddress = sellerPage('#page-section-detail-seller-info > div > div > div > div:nth-child(4)').text().trim();

    console.log("Sellers Rating: ", sellerRating);
    console.log("Seller Name: ", sellerName);
    console.log("Seller Address: ", sellerAddress);

    const dataset = await Actor.openDataset('SKYBOX');
    await dataset.pushData({ ASIN, placement, sellerRating, sellerName, sellerAddress });
});

// Placeholder route for requests labelled 'detail'; intentionally does nothing yet.
router.addHandler('detail', async () => {});

src/sellerInfo.js

1import cheerio from 'cheerio';
2
3
/**
 * Opens the Amazon product page for the given ASIN, follows the seller link
 * and reads the seller's rating, name and address from the seller profile.
 *
 * @param {object} page - Puppeteer page used for navigation; it is left on the
 *     seller profile page when the function returns.
 * @param {string} ASIN - ASIN of the product whose seller should be looked up.
 * @returns {Promise<{sellerRating: string, sellerName: string, sellerAddress: string}>}
 *     The trimmed text of the matched elements (empty strings when an element
 *     is not found).
 */
export default async function getSellerInformation(page, ASIN) {
    // Await the navigation so a failed load rejects here instead of becoming
    // an unhandled promise rejection.
    await page.goto(`https://www.amazon.com/dp/${ASIN}`);
    await page.waitForSelector('#sellerProfileTriggerId');

    // Clicking the seller link opens the seller profile; wait for its rating
    // element before reading the page content.
    await page.click('#sellerProfileTriggerId');
    await page.waitForSelector('#effective-timeperiod-rating-year-description');

    const $ = cheerio.load(await page.content());

    const sellerRating = $('#effective-timeperiod-rating-year-description').text().trim();
    const sellerName = $('#page-section-detail-seller-info > div > div > div > div:nth-child(2) > span:nth-child(2)').text().trim();
    const sellerAddress = $('#page-section-detail-seller-info > div > div > div > div:nth-child(4)').text().trim();

    console.log("Sellers Rating: ", sellerRating);
    console.log("Seller Name: ", sellerName);
    console.log("Seller Address: ", sellerAddress);

    return { sellerRating, sellerName, sellerAddress };
}

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-puppeteer-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "dependencies": {
7        "apify": "^3.0.0",
8        "crawlee": "^3.0.0",
9        "puppeteer": "*"
10    },
11    "devDependencies": {
12        "@apify/eslint-config": "^0.3.1",
13        "eslint": "^8.36.0"
14    },
15    "scripts": {
16        "start": "node src/main.js",
17        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
18    },
19    "author": "It's not you it's me",
20    "license": "ISC"
21}
Developer
Maintained by Community