
Amazon Crawler

Deprecated
This Actor is unavailable because the developer has decided to deprecate it.
androlein/amazon-parser

A simple Actor that uses the keyword as a search input for Amazon, then crawls the first page of results and collects each item's description, link, and all of its offers.
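
The Actor reads its input from the default key-value store; both fields are required, and a link to the scraped dataset is emailed to the given address when the run finishes. A minimal example input (the values are illustrative):

{
    "keyword": "mechanical keyboard",
    "email": "jane.doe@example.com"
}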

Dockerfile

# This is a template for a Dockerfile used to run Actors on the Apify platform.
# The base image name below is set during the Actor build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Copy just package.json and package-lock.json first, since they are the only
# files that affect "npm install" in the next step; this speeds up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid excessive logging, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the source code to the container.
# Do this in the last step, so the build stays fast when only the source code changes.
COPY --chown=myuser:myuser . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local Node.js inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}

main.js

const Apify = require('apify');

const handleFailedRequestFunction = ({ request }) => {
  console.log(`Request ${request.url} failed 4 times`);
};

const baseURL = 'https://amazon.com';

Apify.main(async () => {
  const input = await Apify.getInput();
  const dataset = await Apify.openDataset('amazon-dataset');
  const { keyword, email } = input || {};
  if (!keyword || !email) {
    throw new Error('Please provide both the keyword and the email');
  }
  const enqueueUrl = async (queue, url, userData = {}) => queue.addRequest({ url, userData });
  const asinQueue = await Apify.openRequestQueue('asin');
  const descQueue = await Apify.openRequestQueue('desc');
  const offersQueue = await Apify.openRequestQueue('offers');
  await enqueueUrl(asinQueue, `${baseURL}/s?k=${encodeURIComponent(keyword)}`); // initial search URL; encode the keyword so it is URL-safe

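  // Three-stage pipeline: asinCrawler scrapes the search results page,
  // descCrawler visits each product page, and offersCrawler pages through
  // the offer listings before pushing the finished item to the dataset.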
  const asinCrawler = new Apify.PuppeteerCrawler({
    requestQueue: asinQueue,

    handlePageFunction: async ({ page, request }) => {
      console.log(`Request ${request.url} succeeded!`);
      await Apify.utils.puppeteer.injectJQuery(page);
      // Collect the ASIN and product link of every search result on the page.
      const data = await page.evaluate(() => {
        const asins = [];
        $('[data-asin]').each((index, value) => {
          const asin = value.getAttribute('data-asin');
          const itemUrl = $(value)
            .find('a')
            .attr('href');
          asins.push({
            itemUrl,
            asin,
          });
        });
        return asins;
      });

      // Enqueue each product page, resolving relative links against the base URL;
      // await the calls so no request is lost when the handler returns.
      for (const value of data) {
        if (!value.itemUrl) continue; // skip results without a link
        if (value.itemUrl.match('http')) {
          await enqueueUrl(descQueue, value.itemUrl, value);
        } else {
          await enqueueUrl(descQueue, `${baseURL}${value.itemUrl}`, {
            ...value,
            itemUrl: `${baseURL}${value.itemUrl}`,
          });
        }
      }
    },

    // If a request fails 4 times, this function is executed.
    handleFailedRequestFunction,
  });

  const descCrawler = new Apify.PuppeteerCrawler({
    requestQueue: descQueue,

    handlePageFunction: async ({ page, request }) => {
      console.log(`Request ${request.url} succeeded!`);
      await Apify.utils.puppeteer.injectJQuery(page);
      const { description, title } = await page.evaluate(() => {
        const description = $('#productDescription')
          .find('p')
          .text()
          .trim();
        // Strip the "Amazon.com:" prefix and the trailing category from the page title.
        const titleMatchResult = document.title.match('(?<=:).*?(?=:)');
        const title = titleMatchResult ? titleMatchResult[0].trim() : document.title.trim();
        return { description, title };
      });
      await enqueueUrl(
        offersQueue,
        `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=0`,
        {
          ...request.userData,
          title,
          description,
          offers: [], // initialize offers so multi-page listings do not crash
          startIndex: 0,
        },
      );
    },
    handleFailedRequestFunction,
  });

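  // Offer listings are paginated 10 offers per page; partial results accumulate
  // in request.userData.offers until the last page is reached.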
  const offersCrawler = new Apify.PuppeteerCrawler({
    requestQueue: offersQueue,
    handlePageFunction: async ({ page, request }) => {
      console.log(`Request ${request.url} succeeded!`);
      await Apify.utils.puppeteer.injectJQuery(page);
      const { offersList: offers, isOffersEnded } = await page.evaluate(() => {
        const offersList = [];
        $('.olpOffer').each((index, element) => {
          const sellerName = $(element)
            .find('.olpSellerName')
            .text()
            .trim();
          const shippingBlock = $(element)
            .find('.olpShippingInfo')
            .text()
            .trim();
          const shipping = shippingBlock.toLowerCase().match('free')
            ? 'free'
            : shippingBlock.replace(/\s+/g, ' ');
          const offer = $(element)
            .find('.olpOfferPrice')
            .text()
            .trim();
          offersList.push({ sellerName, shipping, offer });
        });
        // The listing ends when the "next" button is disabled or there is no pagination at all.
        const isOffersEnded = $('.a-disabled.a-last').length || !$('.a-pagination').length;
        return { offersList, isOffersEnded };
      });
      if (isOffersEnded) {
        delete request.userData.asin;
        delete request.userData.startIndex;
        await dataset.pushData({ ...request.userData, keyword, offers: [...request.userData.offers, ...offers] });
      } else {
        await enqueueUrl(
          offersQueue,
          `https://www.amazon.com/gp/offer-listing/${request.userData.asin}?startIndex=${request.userData.startIndex + 10}`,
          {
            ...request.userData,
            offers: [...request.userData.offers, ...offers],
            startIndex: request.userData.startIndex + 10,
          },
        );
      }
    },
    handleFailedRequestFunction,
  });

  // Run the crawlers sequentially: search results, then product pages, then offers.
  await asinCrawler.run();
  await descCrawler.run();
  await offersCrawler.run();
  const datasetInfo = await dataset.getInfo();
  // Email a link to the scraped dataset.
  await Apify.call('apify/send-mail', {
    to: email,
    subject: 'Your data from Apify',
    text: `Here is your data from the Apify Actor: https://api.apify.com/v2/datasets/${datasetInfo.id}/items?format=json`,
  });
});
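
Each item pushed to the amazon-dataset has the shape below; the field names come from main.js, while the values here are only illustrative:

{
    "keyword": "mechanical keyboard",
    "itemUrl": "https://amazon.com/dp/B01EXAMPLE",
    "title": "Example Mechanical Keyboard",
    "description": "Product description text scraped from #productDescription.",
    "offers": [
        { "sellerName": "Example Seller", "shipping": "free", "offer": "$59.99" }
    ]
}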