Amazon Crawler
Deprecated
Pricing
Pay per usage
Go to Store
Amazon Crawler
Deprecated
Simple actor that uses the keyword as a search input for Amazon, then crawls the first page of results, including the description, a link to each item, and all of its offers
0.0 (0)
Pricing
Pay per usage
1
Total users
110
Monthly users
1
Last modified
3 years ago
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Second, copy just package.json and package-lock.json since it should be
# the only file that affects "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}
main.js
const Apify = require('apify');

// Shared failure handler for all three crawlers: invoked once a request has
// exhausted all of its retries ("failed 4 times" reflects the retry limit used here).
const handleFailedRequestFunction = ({ request }) => {
    console.log(`Request ${request.url} failed 4 times`);
};

// Root URL, used both for the initial search request and for resolving
// relative item links found on the results page.
const baseURL = 'https://amazon.com';
// Entry point. Pipeline: search results -> product pages -> paginated offer
// listings, then e-mail the caller a link to the collected dataset.
// Input: { keyword, email } — both required.
Apify.main(async () => {
    const input = await Apify.getInput();
    const dataset = await Apify.openDataset('amazon-dataset');
    const { keyword, email } = input;
    if (!keyword || !email) {
        throw new Error('Please, provide both the keyword and the email');
    }

    // Small helper shared by every stage; always await it so requests are
    // reliably persisted before the enqueuing handler returns.
    const enqueueUrl = async (queue, url, userData = {}) => queue.addRequest({ url, userData });

    // One named queue per pipeline stage.
    const asinQueue = await Apify.openRequestQueue('asin');
    const descQueue = await Apify.openRequestQueue('desc');
    const offersQueue = await Apify.openRequestQueue('offers');
    // Initial url — encode the keyword so spaces/special characters survive.
    await enqueueUrl(asinQueue, `${baseURL}/s?k=${encodeURIComponent(keyword)}`);

    // Stage 1: collect ASINs and item links from the first search-results page.
    const asinCrawler = new Apify.PuppeteerCrawler({
        requestQueue: asinQueue,

        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const data = await page.evaluate(() => {
                const asins = [];
                $('[data-asin]').each((index, value) => {
                    const asin = value.getAttribute('data-asin');
                    const itemUrl = $(value)
                        .find('a')
                        .attr('href');
                    asins.push({
                        itemUrl,
                        asin,
                    });
                });
                return asins;
            });

            // Enqueue sequentially and await each addRequest so nothing is
            // lost if the handler finishes (or fails) before the promise settles.
            for (const value of data) {
                // Some [data-asin] nodes carry no <a> at all (itemUrl undefined) — skip them.
                if (!value.itemUrl) continue;
                if (value.itemUrl.startsWith('http')) {
                    await enqueueUrl(descQueue, value.itemUrl, value);
                } else {
                    // Relative link — resolve it against the Amazon root.
                    await enqueueUrl(descQueue, `${baseURL}${value.itemUrl}`, {
                        ...value,
                        itemUrl: `${baseURL}${value.itemUrl}`,
                    });
                }
            }
        },

        // If request failed 4 times then this function is executed.
        handleFailedRequestFunction,
    });

    // Stage 2: grab title + description from each product page, then hand the
    // item off to the offer-listing stage.
    const descCrawler = new Apify.PuppeteerCrawler({
        requestQueue: descQueue,

        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const { description, title } = await page.evaluate(() => {
                const description = $('#productDescription')
                    .find('p')
                    .text()
                    .trim();
                const titleMatchResult = document.title.match('(?<=:).*?(?=:)');
                const title = titleMatchResult ? titleMatchResult[0].trim() : document.title.trim(); // removing amazon and category text
                return { description, title };
            });
            await enqueueUrl(
                offersQueue,
                `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=0`,
                {
                    ...request.userData,
                    title,
                    description,
                    offers: [], // initialize offers to prevent crash when there are more than one page
                    startIndex: 0,
                },
            );
        },
        handleFailedRequestFunction,
    });

    // Stage 3: walk the paginated offer listing (10 offers per page),
    // accumulating offers in userData; push one dataset record per item once
    // the last page is reached.
    const offersCrawler = new Apify.PuppeteerCrawler({
        requestQueue: offersQueue,
        handlePageFunction: async ({ page, request }) => {
            console.log(`Request ${request.url} succeeded!`);
            await Apify.utils.puppeteer.injectJQuery(page);
            const { offersList: offers, isOffersEnded } = await page.evaluate(() => {
                const offersList = [];
                $('.olpOffer').each((index, element) => {
                    const sellerName = $(element)
                        .find('.olpSellerName')
                        .text()
                        .trim();
                    const shippingBlock = $(element)
                        .find('.olpShippingInfo')
                        .text()
                        .trim();
                    const shipping = shippingBlock.toLowerCase().match('free')
                        ? 'free'
                        : shippingBlock.replace(/\s+/g, ' ');
                    const offer = $(element)
                        .find('.olpOfferPrice')
                        .text()
                        .trim();
                    offersList.push({ sellerName, shipping, offer });
                });
                // Last page: the "next" control is disabled, or there is no pagination at all.
                const isOffersEnded = $('.a-disabled.a-last').length || !$('.a-pagination').length;
                return { offersList, isOffersEnded };
            });
            if (isOffersEnded) {
                // Strip pagination bookkeeping before persisting the record.
                delete request.userData.asin;
                delete request.userData.startIndex;
                await dataset.pushData({
                    ...request.userData,
                    keyword,
                    offers: [...request.userData.offers, ...offers],
                });
            } else {
                await enqueueUrl(
                    offersQueue,
                    `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=${request.userData.startIndex + 10}`,
                    {
                        ...request.userData,
                        offers: [...request.userData.offers, ...offers],
                        startIndex: request.userData.startIndex + 10,
                    },
                );
            }
        },
        handleFailedRequestFunction,
    });

    // Run crawlers in order — each stage drains its queue before the next starts.
    await asinCrawler.run();
    await descCrawler.run();
    await offersCrawler.run();

    const datasetInfo = await dataset.getInfo();
    // send email
    await Apify.call('apify/send-mail', {
        to: email,
        subject: 'Your data from apify',
        text: `Here is your data from apify actor, https://api.apify.com/v2/datasets/${
            datasetInfo.id
        }/items?format=json`,
    });
});