Amazon Crawler
Note: This Actor has been deprecated by its developer and is no longer available.
androlein/amazon-parser
A simple Actor that uses a keyword as an Amazon search input, then crawls the first page of results, collecting each item's description, a link to the item, and all of its offers.
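The Actor takes a JSON input with two required fields, keyword (the Amazon search term) and email (where a link to the results is sent); both are validated in main.js below. A minimal example input (the values are illustrative only):

{
    "keyword": "mechanical keyboard",
    "email": "you@example.com"
}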
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have a fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local Node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}
main.js
const Apify = require('apify');

// Invoked after a request has failed all of its retry attempts (4 tries with the default settings).
const handleFailedRequestFunction = ({ request }) => {
    console.log(`Request ${request.url} failed 4 times`);
};

const baseURL = 'https://amazon.com';

Apify.main(async () => {
    const input = await Apify.getInput();
    const dataset = await Apify.openDataset('amazon-dataset');
    const { keyword, email } = input;
    if (!keyword || !email) {
        throw new Error('Please provide both the keyword and the email');
    }
    const enqueueUrl = async (queue, url, userData = {}) => queue.addRequest({ url, userData });
    const asinQueue = await Apify.openRequestQueue('asin');
    const descQueue = await Apify.openRequestQueue('desc');
    const offersQueue = await Apify.openRequestQueue('offers');
    await enqueueUrl(asinQueue, `${baseURL}/s?k=${encodeURIComponent(keyword)}`); // initial search URL

    // Stage 1: crawl the search results page and collect ASINs with their item URLs.
    const asinCrawler = new Apify.PuppeteerCrawler({
        requestQueue: asinQueue,

        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const data = await page.evaluate(() => {
                const asins = [];
                $('[data-asin]').each((index, value) => {
                    const asin = value.getAttribute('data-asin');
                    const itemUrl = $(value)
                        .find('a')
                        .attr('href');
                    asins.push({
                        itemUrl,
                        asin,
                    });
                });
                return asins;
            });

            for (const value of data) {
                if (!value.itemUrl) continue; // skip results without a link
                if (value.itemUrl.startsWith('http')) {
                    await enqueueUrl(descQueue, value.itemUrl, value);
                } else {
                    // Relative links need the base URL prepended.
                    await enqueueUrl(descQueue, `${baseURL}${value.itemUrl}`, {
                        ...value,
                        itemUrl: `${baseURL}${value.itemUrl}`,
                    });
                }
            }
        },

        // If a request fails 4 times, this function is executed.
        handleFailedRequestFunction,
    });

    // Stage 2: visit each item page and extract its title and description.
    const descCrawler = new Apify.PuppeteerCrawler({
        requestQueue: descQueue,

        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const { description, title } = await page.evaluate(() => {
                const description = $('#productDescription')
                    .find('p')
                    .text()
                    .trim();
                // Strip the "Amazon.com:" prefix and the category suffix from the page title.
                const titleMatchResult = document.title.match('(?<=:).*?(?=:)');
                const title = titleMatchResult ? titleMatchResult[0].trim() : document.title.trim();
                return { description, title };
            });
            await enqueueUrl(
                offersQueue,
                `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=0`,
                {
                    ...request.userData,
                    title,
                    description,
                    offers: [], // initialize offers to prevent a crash when there is more than one page
                    startIndex: 0,
                },
            );
        },
        handleFailedRequestFunction,
    });

    // Stage 3: page through each item's offer listing and accumulate all offers.
    const offersCrawler = new Apify.PuppeteerCrawler({
        requestQueue: offersQueue,
        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const { offersList: offers, isOffersEnded } = await page.evaluate(() => {
                const offersList = [];
                $('.olpOffer').each((index, element) => {
                    const sellerName = $(element)
                        .find('.olpSellerName')
                        .text()
                        .trim();
                    const shippingBlock = $(element)
                        .find('.olpShippingInfo')
                        .text()
                        .trim();
                    const shipping = shippingBlock.toLowerCase().match('free')
                        ? 'free'
                        : shippingBlock.replace(/\s+/g, ' ');
                    const offer = $(element)
                        .find('.olpOfferPrice')
                        .text()
                        .trim();
                    offersList.push({ sellerName, shipping, offer });
                });
                // The listing ends when the "next" button is disabled or there is no pagination at all.
                const isOffersEnded = $('.a-disabled.a-last').length || !$('.a-pagination').length;
                return { offersList, isOffersEnded };
            });
            if (isOffersEnded) {
                delete request.userData.asin;
                delete request.userData.startIndex;
                await dataset.pushData({ ...request.userData, keyword, offers: [...request.userData.offers, ...offers] });
            } else {
                // More pages of offers: enqueue the next page, carrying the accumulated offers along.
                await enqueueUrl(
                    offersQueue,
                    `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=${request.userData.startIndex + 10}`,
                    {
                        ...request.userData,
                        offers: [...request.userData.offers, ...offers],
                        startIndex: request.userData.startIndex + 10,
                    },
                );
            }
        },
        handleFailedRequestFunction,
    });

    // Run the crawlers sequentially: each stage feeds the next stage's queue.
    await asinCrawler.run();
    await descCrawler.run();
    await offersCrawler.run();
    const datasetInfo = await dataset.getInfo();
    // Email a link to the scraped dataset via the apify/send-mail Actor.
    await Apify.call('apify/send-mail', {
        to: email,
        subject: 'Your data from apify',
        text: `Here is your data from apify actor, https://api.apify.com/v2/datasets/${datasetInfo.id}/items?format=json`,
    });
});
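Each item pushed to the amazon-dataset carries the fields assembled across the three crawlers above: the search keyword, the item's URL and ASIN-derived data, its title and description, and the accumulated offers (asin and startIndex are deleted before the push). A sketch of one dataset item, with field names taken from the code and made-up values:

{
    "keyword": "mechanical keyboard",
    "itemUrl": "https://amazon.com/dp/B000000000",
    "title": "Example Mechanical Keyboard",
    "description": "A sample product description.",
    "offers": [
        { "sellerName": "Example Seller", "shipping": "free", "offer": "$49.99" }
    ]
}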
Maintained by Community