Amazon Crawler

Deprecated

Developed by Kiryl Surahatau

Maintained by Community
A simple actor that uses a keyword as the search input for Amazon, then crawls the first page of results, collecting each item's description, a link to the item, and all of its offers.

Rating: 0.0 (0)

Pricing: Pay per usage

Total users: 110

Monthly users: 1

Last modified: 3 years ago
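
Example input

The actor requires two input fields, keyword and email; main.js below throws at startup if either is missing. A minimal input, with placeholder values, might look like this:

{
    "keyword": "wireless mouse",
    "email": "you@example.com"
}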

Dockerfile

# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10
# Copy just package.json and package-lock.json first, since they are the only
# files that affect "npm install" in the next step; this speeds up rebuilds
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY --chown=myuser:myuser . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}

main.js

const Apify = require('apify');

// Called after a request has exhausted its retries (4 attempts).
const handleFailedRequestFunction = ({ request }) => {
    console.log(`Request ${request.url} failed 4 times`);
};

const baseURL = 'https://amazon.com';

Apify.main(async () => {
    const input = await Apify.getInput();
    const dataset = await Apify.openDataset('amazon-dataset');
    const { keyword, email } = input;
    if (!keyword || !email) {
        throw new Error('Please provide both the keyword and the email');
    }
    const enqueueUrl = async (queue, url, userData = {}) => queue.addRequest({ url, userData });
    const asinQueue = await Apify.openRequestQueue('asin');
    const descQueue = await Apify.openRequestQueue('desc');
    const offersQueue = await Apify.openRequestQueue('offers');
    // Seed the pipeline with the search results page; encode the keyword so
    // multi-word searches produce a valid URL.
    await enqueueUrl(asinQueue, `${baseURL}/s?k=${encodeURIComponent(keyword)}`);
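
    // Pipeline overview: three crawlers run in sequence. asinCrawler collects
    // ASINs and item URLs from the search results page, descCrawler scrapes
    // each item's title and description, and offersCrawler pages through every
    // offer listing for an ASIN before the combined record is saved.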
    const asinCrawler = new Apify.PuppeteerCrawler({
        requestQueue: asinQueue,

        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const data = await page.evaluate(() => {
                const asins = [];
                $('[data-asin]').each((index, value) => {
                    const asin = value.getAttribute('data-asin');
                    const itemUrl = $(value)
                        .find('a')
                        .attr('href');
                    asins.push({
                        itemUrl,
                        asin,
                    });
                });
                return asins;
            });

            // Enqueue each item's detail page, awaiting every request so none
            // are silently dropped; relative URLs are resolved against baseURL.
            for (const value of data) {
                if (value.itemUrl.match('http')) {
                    await enqueueUrl(descQueue, value.itemUrl, value);
                } else {
                    await enqueueUrl(descQueue, `${baseURL}${value.itemUrl}`, {
                        ...value,
                        itemUrl: `${baseURL}${value.itemUrl}`,
                    });
                }
            }
        },

        // If a request fails 4 times, this function is executed.
        handleFailedRequestFunction,
    });

    const descCrawler = new Apify.PuppeteerCrawler({
        requestQueue: descQueue,

        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const { description, title } = await page.evaluate(() => {
                const description = $('#productDescription')
                    .find('p')
                    .text()
                    .trim();
                const titleMatchResult = document.title.match('(?<=:).*?(?=:)');
                const title = titleMatchResult ? titleMatchResult[0].trim() : document.title.trim(); // strip the "Amazon.com:" prefix and trailing category from the page title
                return { description, title };
            });
            await enqueueUrl(
                offersQueue,
                `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=0`,
                {
                    ...request.userData,
                    title,
                    description,
                    offers: [], // initialize offers so results can accumulate when there is more than one page
                    startIndex: 0,
                },
            );
        },
        handleFailedRequestFunction,
    });
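
    // The /gp/offer-listing/<ASIN> endpoint paginates offers in steps of 10
    // via the startIndex query parameter; the crawler below accumulates offers
    // in userData and re-enqueues the next page until pagination is exhausted.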
    const offersCrawler = new Apify.PuppeteerCrawler({
        requestQueue: offersQueue,
        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const { offersList: offers, isOffersEnded } = await page.evaluate(() => {
                const offersList = [];
                $('.olpOffer').each((index, element) => {
                    const sellerName = $(element)
                        .find('.olpSellerName')
                        .text()
                        .trim();
                    const shippingBlock = $(element)
                        .find('.olpShippingInfo')
                        .text()
                        .trim();
                    const shipping = shippingBlock.toLowerCase().match('free')
                        ? 'free'
                        : shippingBlock.replace(/\s+/g, ' ');
                    const offer = $(element)
                        .find('.olpOfferPrice')
                        .text()
                        .trim();
                    offersList.push({ sellerName, shipping, offer });
                });
                // The last page either disables the "next" link or shows no pagination at all.
                const isOffersEnded = $('.a-disabled.a-last').length || !$('.a-pagination').length;
                return { offersList, isOffersEnded };
            });
            if (isOffersEnded) {
                delete request.userData.asin;
                delete request.userData.startIndex;
                await dataset.pushData({ ...request.userData, keyword, offers: [...request.userData.offers, ...offers] });
            } else {
                await enqueueUrl(
                    offersQueue,
                    `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=${request.userData.startIndex + 10}`,
                    {
                        ...request.userData,
                        offers: [...request.userData.offers, ...offers],
                        startIndex: request.userData.startIndex + 10,
                    },
                );
            }
        },
        handleFailedRequestFunction,
    });

    // Run the crawlers sequentially: each stage fills the queue consumed by
    // the next one, so the order matters.
    await asinCrawler.run();
    await descCrawler.run();
    await offersCrawler.run();
    const datasetInfo = await dataset.getInfo();
    // Email the caller a link to the resulting dataset items.
    await Apify.call('apify/send-mail', {
        to: email,
        subject: 'Your data from apify',
        text: `Here is your data from apify actor, https://api.apify.com/v2/datasets/${datasetInfo.id}/items?format=json`,
    });
});
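
Fetching the results

When the run finishes, the actor emails a link to the dataset items in the Apify API. Below is a minimal sketch of downloading those items outside the platform, assuming Node 18+ for the global fetch API; the dataset ID placeholder is hypothetical and would come from the emailed link.

const datasetId = '<DATASET_ID>'; // hypothetical placeholder from the emailed link

(async () => {
    // Public dataset items endpoint, the same URL format used in the email text.
    const response = await fetch(`https://api.apify.com/v2/datasets/${datasetId}/items?format=json`);
    const items = await response.json();
    console.log(`Fetched ${items.length} scraped items`);
})();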