1
2import { Actor } from 'apify';
3
4import { CheerioCrawler, Dataset } from 'crawlee';
5import { PlaywrightCrawler } from 'crawlee';
6import { ProxyConfiguration } from 'apify';
7
8
9
10
11
// Initialise the Actor before reading its input.
await Actor.init();

// Run input. When no input is supplied (null/undefined) an empty object is
// used instead, so every option below falls back to its default.
//   pageurl             - start URL for the "search" crawlers
//   asincode            - ASIN interpolated into the Amazon offers URL
//   type / service      - together select which crawler branch runs
//   maxRequestsPerCrawl - passed straight to each crawler
const actorInput = (await Actor.getInput()) ?? {};
const {
    pageurl = 'https://www.amazon.in/gp/aod/ajax?asin=B0D945V84N&ref=auto_load_aod&pc=dp',
    asincode = 'B0B4N77Y34',
    type = 'asin',
    service = 'amazon',
    maxRequestsPerCrawl = 2,
} = actorInput;
22
23
// Proxy settings shared by every crawler in this file: Apify Proxy,
// RESIDENTIAL group, US exit country.
//
// Created through Actor.createProxyConfiguration() instead of calling the
// ProxyConfiguration constructor directly: the factory also initializes the
// configuration (it checks access to the requested proxy groups up front), so
// a misconfigured proxy fails here rather than on the first request.
//
// NOTE(review): the default URLs target amazon.in while the exit country is
// 'US' — confirm this is intentional.
const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    countryCode: 'US',
});
29
// Amazon "search" mode: fetch `pageurl` with a Cheerio-based crawler and store
// the `src` attribute of the `#landingImage` element.
if (service === 'amazon' && type === 'search') {
    /**
     * Handles one fetched page: reads the landing image URL, logs it together
     * with the loaded URL and pushes it to the default dataset. The record is
     * pushed even when the element is missing (image is then undefined).
     */
    const handleSearchPage = async ({ request, $, log }) => {
        log.info('enqueueing new URLs');

        const landingImageSrc = $('#landingImage').attr('src');
        log.info('Log', { url: request.loadedUrl, image: landingImageSrc });

        await Dataset.pushData({ image: landingImageSrc });
    };

    const searchCrawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        requestHandler: handleSearchPage,
    });

    await searchCrawler.run([pageurl]);
}
47
// Amazon "asin" mode: fetch the all-offers-display (aod) fragment for
// `asincode` with a Cheerio-based crawler and store the product image URL.
if (type === 'asin' && service === 'amazon') {
    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        // Single handler. The former `handlePageFunction` option was the
        // deprecated name for `requestHandler`; it was never invoked because
        // `requestHandler` was also supplied, and it referenced an undefined
        // `log`. Its proxy logging now lives here.
        async requestHandler({ request, $, log, proxyInfo }) {
            log.info('enqueueing new URLs');

            // Log only the proxy host: the full proxy URL can carry credentials.
            log.info('Proxy used', { proxy: proxyInfo?.hostname });

            const image = $('#aod-asin-image-id').attr('src');
            log.info('Log', { url: request.loadedUrl, image });

            await Dataset.pushData({ image });
        },
    });

    // Encode the ASIN so unexpected characters in the input cannot alter the
    // query string.
    const offersUrl = `https://www.amazon.in/gp/aod/ajax?asin=${encodeURIComponent(asincode)}&ref=auto_load_aod&pc=dp`;
    await crawler.run([offersUrl]);
}
69
// Amazon "asin" mode, browser variant (service 'amazon_v2'): load the
// all-offers-display (aod) fragment for `asincode` in a Playwright-driven
// browser and store the product image URL.
if (type === 'asin' && service === 'amazon_v2') {
    const crawler = new PlaywrightCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, parseWithCheerio, log }) {
            log.info('enqueueing new URLs');

            // The Playwright crawling context has no `$`; parseWithCheerio()
            // returns a Cheerio handle for the page so the same selector
            // logic as the Cheerio crawler can be used.
            const $ = await parseWithCheerio();

            const image = $('#aod-asin-image-id').attr('src');
            log.info('Log', { url: request.loadedUrl, image });

            await Dataset.pushData({ image });
        },
    });

    // Encode the ASIN so unexpected characters in the input cannot alter the
    // query string.
    const offersUrl = `https://www.amazon.in/gp/aod/ajax?asin=${encodeURIComponent(asincode)}&ref=auto_load_aod&pc=dp`;
    await crawler.run([offersUrl]);
}
87
// Flipkart "search" mode: fetch `pageurl` with a Cheerio-based crawler and
// store one representative image URL for the page.
if (type === 'search' && service === 'flipkart') {
    // Layout-specific selectors for the first image on a page without
    // purchase buttons. They depend on Flipkart's generated class names and
    // are kept verbatim.
    const LISTING_IMAGE_SELECTOR = 'div._1YokD2._2GoDe3 > div:nth-child(2) > div:nth-child(2) > div > div:nth-child(1) > div > a._2rpwqI > div:nth-child(1) > div > div > img';
    const CATEGORY_IMAGE_SELECTOR = '#container > div > div._36fx1h._6t1WkM._3HqJxg > div._1YokD2._2GoDe3 > div:nth-child(2) > div:nth-child(2) > div > div:nth-child(1) > div > a > div:nth-child(1) > div > div > div > img';

    /**
     * Picks the image URL for a loaded Flipkart page.
     *
     * Order of preference:
     *   1. the `og:image` meta tag, when present;
     *   2. neither "Add to cart" nor "Buy Now" button on the page: the first
     *      listing image, falling back to the first category image;
     *   3. both buttons present: the first eagerly-loaded <img>;
     *   4. exactly one of the buttons present: the page favicon.
     *
     * @param {Function} $ - Cheerio handle for the loaded page.
     * @returns {string|undefined} The image URL, or undefined when the
     *   applicable source is missing.
     */
    const findFlipkartImage = ($) => {
        const ogImage = $('meta[property="og:image"]').attr('content');
        if (ogImage) {
            return ogImage;
        }

        const hasAddToCart = $('button:contains("Add to cart")').length > 0;
        const hasBuyNow = $('button:contains("Buy Now")').length > 0;

        if (!hasAddToCart && !hasBuyNow) {
            const tile = $(LISTING_IMAGE_SELECTOR)[0] ?? $(CATEGORY_IMAGE_SELECTOR)[0];
            return tile?.attribs.src;
        }

        if (hasAddToCart && hasBuyNow) {
            return $('img[loading="eager"]')[0]?.attribs.src;
        }

        return $('link[rel="icon"]').attr('href');
    };

    const crawler = new CheerioCrawler({
        maxRequestRetries: 5,
        proxyConfiguration,
        maxRequestsPerCrawl,
        async requestHandler({ request, $, log }) {
            log.info('enqueueing new URLs');

            const image = findFlipkartImage($);
            // Nothing usable on the page: skip instead of storing an empty
            // record.
            if (!image) {
                return;
            }

            log.info('Log', { url: request.loadedUrl });
            await Dataset.pushData({ image });
        },
    });

    await crawler.run([pageurl]);
}


// Every crawler branch above awaits its run, so all crawling has finished by
// the time the Actor exits.
await Actor.exit();