Bzj Amazon Actor

This Actor is under maintenance and may be unreliable.

fateful_orangerie/bzj-amazon-actor

Crawl and extract Amazon data at scale using an Actor integrated with the Scrapeless Amazon Scraper API.

Maintained by Community

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project. The dev dependencies installed
# above provide the TypeScript compiler.
RUN npm run build

# Create the final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies
# to keep the image small. Avoid logging too much and print the
# dependency tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy the built JS files from the builder image
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start:prod --silent
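
The Dockerfile is a two-stage build: the builder stage installs the dev dependencies and compiles the TypeScript sources, while the final stage installs only production dependencies and copies the compiled dist folder across, keeping the runtime image small.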

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "bzj-amazon-actor",
  "title": "Bzj Amazon Actor",
  "description": "Scrape Amazon data via the Scrapeless Amazon Scraper API.",
  "version": "0.0",
  "meta": {
    "templateId": "ts-start"
  },
  "input": "./input_schema.json",
  "dockerfile": "./Dockerfile",
  "storages": {
    "dataset": "./dataset_schema.json"
  }
}

.actor/dataset_schema.json

{
  "actorSpecification": 1,
  "views": {
    "overview": {
      "title": "Overview",
      "transformation": {
        "fields": [
          "count",
          "data",
          "code",
          "message"
        ]
      },
      "display": {
        "component": "table",
        "properties": {
          "count": {
            "label": "count",
            "format": "text"
          },
          "data": {
            "label": "data",
            "format": "object"
          },
          "code": {
            "label": "code",
            "format": "text"
          },
          "message": {
            "label": "message",
            "format": "text"
          }
        }
      }
    }
  }
}
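
Based on the fields in the overview view, each dataset record appears to mirror the Scrapeless response envelope. A purely illustrative record might look like the following; the actual shape of the data field is determined by the Scrapeless API and is assumed here:

{
  "count": 20,
  "data": { "results": [] },
  "code": 200,
  "message": "success"
}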

.actor/input_schema.json

{
  "title": "Actor BZJ",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "apiKey": {
      "title": "API Key",
      "type": "string",
      "editor": "textfield",
      "description": "Get your [API key](https://app.scrapeless.com/dashboard/account?tab=apiKey) for free"
    },
    "action": {
      "title": "Scraper Action",
      "type": "string",
      "enum": [
        "keywords",
        "product",
        "seller"
      ],
      "description": "The Amazon Scraper action type to use for crawling",
      "prefill": "keywords"
    },
    "webhook": {
      "title": "Webhook",
      "type": "string",
      "editor": "textfield",
      "description": "Webhook URL to send the scraped data to",
      "default": ""
    },
    "keywords": {
      "title": "Keywords",
      "sectionCaption": "Keywords options",
      "sectionDescription": "Configuration of the Keywords action",
      "type": "string",
      "editor": "textfield",
      "description": "Amazon keywords to search for",
      "default": "iPhone 12",
      "prefill": "iPhone 12"
    },
    "maxConcurrency": {
      "title": "Maximum concurrency",
      "type": "integer",
      "maximum": 100,
      "description": "Maximum concurrency to use for crawling",
      "default": 10,
      "prefill": 10
    },
    "pages": {
      "title": "Pages",
      "type": "integer",
      "maximum": 100,
      "description": "Total number of pages to crawl",
      "default": 3,
      "prefill": 3
    },
    "domain": {
      "title": "Domain",
      "type": "string",
      "editor": "textfield",
      "description": "Amazon top-level domain (e.g. com, co.uk, de)",
      "default": "com",
      "prefill": "com"
    },
    "productUrl": {
      "title": "Product details URL",
      "sectionCaption": "Product options",
      "sectionDescription": "Configuration of the Product action",
      "type": "string",
      "editor": "textfield",
      "description": "Amazon product details URL",
      "prefill": "https://www.amazon.com/dp/B0BQXHK363"
    },
    "sellerUrl": {
      "title": "Seller details URL",
      "sectionCaption": "Seller options",
      "sectionDescription": "Configuration of the Seller action",
      "type": "string",
      "editor": "textfield",
      "description": "Amazon seller details URL",
      "prefill": "https://www.amazon.com/dp/B0BQXHK363"
    }
  },
  "required": [
    "apiKey",
    "action"
  ]
}
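
For reference, here is a complete input for the default keywords action, assembled from the defaults and prefills in the schema above (the API key value is a placeholder):

{
  "apiKey": "<YOUR_SCRAPELESS_API_KEY>",
  "action": "keywords",
  "keywords": "iPhone 12",
  "domain": "com",
  "pages": 3,
  "maxConcurrency": 10,
  "webhook": ""
}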

src/main.ts

import { Actor, log } from 'apify';
import Scrapeless from 'scrapeless-sdk-node';

await Actor.init();

enum AmazonActionEnum {
    product = 'product',
    seller = 'seller',
    keywords = 'keywords',
}

interface Input {
    apiKey: string;
    action: AmazonActionEnum;
    webhook: string;
    productUrl: string;
    sellerUrl: string;
    keywords: string;
    maxConcurrency: number;
    pages: number;
    domain: string;
}

const {
    apiKey = '',
    action = AmazonActionEnum.keywords,
    webhook = '',
    keywords = 'iPhone 12',
    domain = 'com',
    pages = 3,
    maxConcurrency = 10,
    productUrl = 'https://www.amazon.com/dp/B0BQXHK363',
    sellerUrl = 'https://www.amazon.com/dp/B0BQXHK363',
} = await Actor.getInput<Input>() ?? {};

// Never spawn more workers than there are pages to fetch.
const CONCURRENCY_LIMIT = pages < maxConcurrency ? pages : maxConcurrency;

// @ts-expect-error scrapeless-sdk-node
const scrapeless = new Scrapeless({ apiKey });

// Build the Scrapeless input payload for the selected action.
function getScrapelessInput(currentPage = 1) {
    const baseInput = { action };
    if (action === AmazonActionEnum.seller) {
        return { ...baseInput, url: sellerUrl };
    }
    if (action === AmazonActionEnum.product) {
        return { ...baseInput, url: productUrl };
    }
    // keywords
    return { ...baseInput, keywords, page: currentPage.toString(), domain };
}

// Single request, used for the product and seller actions.
async function scraperFetch() {
    const response = await scrapeless.scraper({
        actor: 'scraper.amazon',
        webhook,
        input: getScrapelessInput(),
    });
    await Actor.pushData(response as object);
}

// Fetch all result pages for the keywords action using a simple worker pool.
async function keywordsConcurrencyScraperFetch() {
    const requestQueue: (() => Promise<object>)[] = [];
    for (let page = 1; page <= pages; page++) {
        requestQueue.push(() => {
            return scrapeless.scraper({
                actor: 'scraper.amazon',
                webhook,
                input: getScrapelessInput(page),
            });
        });
    }

    const successfulResults: object[] = [];
    let currentIndex = 0;
    // Each worker keeps pulling the next pending request until the queue is drained.
    async function worker() {
        while (currentIndex < requestQueue.length) {
            try {
                log.info(`[Current page number]: ${currentIndex + 1}`);
                const result = await requestQueue[currentIndex++]();
                await Actor.pushData(result);
                successfulResults.push(result);
            } catch (error) {
                log.error(`[Request failed]: ${error}`);
            }
        }
    }

    const workers: Promise<void>[] = [];
    for (let i = 1; i <= CONCURRENCY_LIMIT; i++) {
        workers.push(worker());
    }
    await Promise.all(workers);
    log.info(`[🎉 Successfully captured ${successfulResults.length} pages of data]`);
    await Actor.setValue('OUTPUT', successfulResults);
}

if (action === AmazonActionEnum.keywords) {
    await keywordsConcurrencyScraperFetch();
} else {
    await scraperFetch();
}

await Actor.exit();
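
Once deployed, the Actor can also be called programmatically. A minimal sketch using the separate apify-client package (not a dependency of this repo; both credential values are placeholders):

import { ApifyClient } from 'apify-client';

// Both credentials below are placeholders.
const client = new ApifyClient({ token: '<YOUR_APIFY_TOKEN>' });

// Start the Actor and wait for the run to finish.
const run = await client.actor('fateful_orangerie/bzj-amazon-actor').call({
    apiKey: '<YOUR_SCRAPELESS_API_KEY>',
    action: 'keywords',
    keywords: 'iPhone 12',
    pages: 3,
});

// Read the scraped records from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Fetched ${items.length} records`);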

.dockerignore

# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

# dist folder
dist

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "root": true,
    "env": {
        "browser": true,
        "es2020": true,
        "node": true
    },
    "extends": [
        "@apify/eslint-config-ts"
    ],
    "parserOptions": {
        "project": "./tsconfig.json",
        "ecmaVersion": 2020
    },
    "ignorePatterns": [
        "node_modules",
        "dist",
        "**/*.d.ts"
    ]
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.vscode
storage
apify_storage
crawlee_storage
node_modules
dist
tsconfig.tsbuildinfo
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

# Added by Apify CLI
.venv

package.json

{
	"name": "bzj-amazon-actor",
	"version": "0.0.1",
	"type": "module",
	"description": "Apify Actor that scrapes Amazon data via the Scrapeless Amazon Scraper API.",
	"engines": {
		"node": ">=18.0.0"
	},
	"dependencies": {
		"apify": "^3.2.6",
		"axios": "^1.5.0",
		"cheerio": "^1.0.0-rc.12",
		"scrapeless-sdk-node": "^0.0.3"
	},
	"devDependencies": {
		"@apify/eslint-config-ts": "^0.3.0",
		"@apify/tsconfig": "^0.1.0",
		"@typescript-eslint/eslint-plugin": "^7.18.0",
		"@typescript-eslint/parser": "^7.18.0",
		"eslint": "^8.50.0",
		"tsx": "^4.6.2",
		"typescript": "^5.3.3"
	},
	"scripts": {
		"start": "npm run start:dev",
		"start:prod": "node dist/main.js",
		"start:dev": "tsx src/main.ts",
		"build": "tsc",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
	},
	"author": "It's not you it's me",
	"license": "ISC"
}
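
Note how the scripts line up with the Dockerfile: start:dev runs the TypeScript sources directly via tsx, build compiles them into dist/, and start:prod runs the compiled output, which is what the final image's CMD invokes.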

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["DOM"]
    },
    "include": [
        "./src/**/*"
    ]
}
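
One caveat of this setup: because package.json declares "type": "module" and the compiler uses NodeNext module resolution, any relative imports added under src/ must carry explicit .js extensions so the emitted ES modules resolve at runtime.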