Mironet Scraper
View all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Mironet Scraper
petr_cermak/mironet-scraper
Scrapes all Mironet.cz products.
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10"
    },
    "scripts": {
        "start": "node main.js"
    }
}
main.js
const Apify = require('apify');
2
/**
 * Reads a property from a Puppeteer ElementHandle and returns it as trimmed text.
 * @param {ElementHandle|null} element - The element to read from; may be null
 *   (e.g. the result of a failed `page.$(...)` / `item.$(...)` lookup).
 * @param {string} attr - Name of the property to read (e.g. 'href', 'textContent').
 * @returns {Promise<string|null>} The trimmed string value, or null when the
 *   element is missing or the property cannot be read as a string.
 */
async function getAttribute(element, attr){
    // Explicit guard so callers can pass the result of page.$(...) directly
    // without a null check of their own.
    if(!element){return null;}
    try{
        const prop = await element.getProperty(attr);
        return (await prop.jsonValue()).trim();
    }
    catch(e){
        // Best-effort extraction: any failure (detached handle, non-string
        // value without .trim(), ...) is reported as "no value" rather than
        // aborting the crawl.
        return null;
    }
}
15
/** Main function: crawls Mironet.cz category pages and extracts product data. */
Apify.main(async () => {

    // Open request queue and add the start URL.
    const requestQueue = await Apify.openRequestQueue();
    await requestQueue.addRequest(new Apify.Request({
        userData: {label: 'start'},
        url: 'https://www.mironet.cz/'
    }));

    // Disable all non-essential requests — the scraper only needs the HTML,
    // so images and stylesheets are aborted to save bandwidth and time.
    const gotoFunction = async ({ page, request }) => {
        await page.setRequestInterception(true);
        page.on('request', intercepted => {
            const type = intercepted.resourceType();
            if(type === 'image' || type === 'stylesheet'){intercepted.abort();}
            else{intercepted.continue();}
        });
        await Apify.utils.puppeteer.hideWebDriver(page);
        return page.goto(request.url, {timeout: 200000});
    };

    // Extracts the first run of digits from a price string (whitespace is
    // stripped first, so "12 990 Kč" -> "12990"). Hoisted out of the product
    // loop so the function and regexes are not re-created per item.
    // Throws (caught below) when the string contains no digits.
    const toNumber = p => p.replace(/\s/g, '').match(/\d+/)[0];

    // Handle page context.
    const handlePageFunction = async ({ page, request }) => {

        // This is the start page.
        if(request.userData.label === 'start'){

            // Enqueue category links.
            await Apify.utils.puppeteer.enqueueLinks({
                page: page,
                requestQueue: requestQueue,
                selector: '.nadpis a',
                pseudoUrls: null,
                userData: {label: 'page'}
            });
        }

        // This is the category page.
        else if(request.userData.label === 'page'){

            // Enqueue pagination pages (they are handled as category pages too).
            await Apify.utils.puppeteer.enqueueLinks({
                page: page,
                requestQueue: requestQueue,
                selector: 'a.PageNew',
                pseudoUrls: null,
                userData: {label: 'page'}
            });

            // Iterate all products and extract data.
            const items = await page.$$('.item_b');
            for(const item of items){
                try{
                    const idElem = await item.$('.item_kod');
                    const linkElem = await item.$('.nazev a');
                    const priceElem = await item.$('.item_cena');
                    const imgElem = await item.$('.item_obr img');
                    const oPriceElem = await item.$('.item_s_cena span');
                    const img = await getAttribute(imgElem, 'src');
                    const link = await getAttribute(linkElem, 'href');
                    const id = await getAttribute(idElem, 'textContent');
                    const name = await getAttribute(linkElem, 'textContent');
                    const price = await getAttribute(priceElem, 'textContent');
                    const dataItem = {
                        img: img,
                        // Throws (caught below) when the product code is
                        // missing — such items are skipped on purpose.
                        itemId: id.match(/\d+/)[0],
                        itemUrl: link,
                        itemName: name,
                        // An ".item_s_cena span" element is only present for
                        // discounted products.
                        discounted: Boolean(oPriceElem),
                        currentPrice: price ? toNumber(price) : null
                    };
                    if(oPriceElem){
                        const oPrice = await getAttribute(oPriceElem, 'textContent');
                        dataItem.originalPrice = toNumber(oPrice);
                    }

                    // Save data to dataset.
                    await Apify.pushData(dataItem);
                }
                catch(e){console.log(e);}
            }
        }
    };

    // Create crawler.
    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        // Use proxy.
        launchPuppeteerOptions: {
            useApifyProxy: true
        },

        gotoFunction,

        handlePageFunction,

        // If a request failed 4 times then this function is executed.
        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    // Run crawler.
    await crawler.run();
});
Developer
Maintained by Community
Categories